diff --git a/Regular+Expressions+Practice.html b/Regular+Expressions+Practice.html new file mode 100755 index 0000000..6e13b35 --- /dev/null +++ b/Regular+Expressions+Practice.html @@ -0,0 +1,11953 @@ + + +
+a.) Military times after noon
+ +import numpy
+import pandas
+import re
+
+Times=['15:30', '12:00', '2:00', '21:54', '01:69', '01:20', '6:00', '30:00', '14:50','12:01']
+
+strtimes=' '.join(Times)
+
+MT=re.compile('1[0-9]:[0-5][0-9]|2[0-3]:[0-5][0-9]')
+
+militarytimes=re.findall(MT,strtimes)
+print(militarytimes)
+The regex here is defined as the variable 'MT'. The first half (before the |) describes the situation in which the time begins with a 1 while the second half describes the situation in which the time begins with a 2. Combining these two halves lets the regex search for all times that include and come after 12:00 without matching any fake times or times before noon.
+ +b.) Correctly formatted genus and species
+ +Species=['H. Sapien', 'Homo sapien', 'H. sapien', 'D. melanogaster', 'D. Melanogaster', 'Drosophila melanogaster']
+
+strspecies=' '.join(Species)
+
+sregex=re.compile('[A-Z]\. [a-z]+')
+
+slist=re.findall(sregex,strspecies)
+print(slist)
+The regex, sregex, utilizes the escape character to ensure it searches for a period after the capital letter. The plus next to the "all lowercase letters" set ensures that the regex will find any species names (correctly written in all lowercase) no matter how long.
+ +c.) Social Security Number
+ +socialsecurity=['555-55-5555', '123-45-6789', '55555-55555', '1234567890']
+
+strssn=' '.join(socialsecurity)
+
+ssnregex=re.compile('[0-9]{3}\-[0-9]{2}\-[0-9]{4}')
+
+ssnlist=re.findall(ssnregex,strssn)
+print(ssnlist)
+The curly brackets in the regex, ssnregex, tell the regex to search for 3 digits in a row, then 2 digits in a row, and finally 4 digits in a row. The escape character forces the regex to search for literal hyphens instead of some other operator.
+ +Hi, this is a tutorial on how to code a t-test using maximum likelihood in Python. This tutorial will explain how to create separate negative log-likelihood functions for the null and alternative models as well as produce a t-test that compares these models.
+Before we jump into the code, we need to define the statistical models for the t-test. The two models are the null model and the alternative model.
+In the null model of a t-test, we hypothesize that there is no difference between the means of two populations. We describe the model using the equation:
+$$y = \beta_0 + ε$$In the alternative model, we hypothesize that there is a significant difference between the means.
+$$y = \beta_0 + \beta_1x + ε$$ +import pandas
+import numpy
+import scipy.stats
+from scipy.optimize import minimize
+from plotnine import *
+
+#Import our data set
+data = pandas.read_csv("chickwts.txt")
+
+#Graph a summary plot of means for each feed type
+ggplot(data=data) + geom_bar(aes(x='factor(feed)', y='weight'), stat = "summary", fun_y = numpy.mean) + theme_classic() + xlab("feed type") + ylab("mean weight")
+
+#A function to return the negative log-likelihood for a given observation
+def nllike (p, obs):
+ B0 = p[0]
+ B1 = p[1]
+ sigma = p[2]
+ expected = B0 + B1 * obs.x
+ nll = -1 * scipy.stats.norm(expected, sigma).logpdf(obs.y).sum()
+ return nll
+
+#A function to return the negative log-likelihood for the null model
+def nllike_null (p, obs):
+ B0 = p[0]
+ sigma = p[1]
+ expected = B0
+ nll = -1 * scipy.stats.norm(expected, sigma).logpdf(obs.y).sum()
+ return nll
+
+#function for returning p-value for t-test
+def ttest (data, group1, group2, x, y):
+ #define a temporary slice of the data
+ temp_df = data[(data[x]==group1) | (data[x]==group2)]
+
+ #Make new column 'x' set as 0 or 1 based on group
+ temp_df["x"] = 0
+ temp_df["x"][temp_df[x] == group2] = 1
+ temp_df["y"] = temp_df[y]
+
+ # y = B0 + B1*x + E
+ model = minimize(nllike, [1, 1, 1], method = "Nelder-Mead", options={'disp': True}, args = temp_df)
+
+ # y = B0 + E
+ null_model = minimize(nllike_null, [1, 1], method = "Nelder-Mead", options={'disp': True}, args = temp_df)
+
+ #Get differences in fit
+ D = (null_model.fun - model.fun) * 2
+
+ #Use chi3.sf() for returning p-value
+ p = scipy.stats.chi2.sf(D,1)
+
+ #Print results
+ print ("-----------------------------")
+ print (group1 + " vs. " + group2)
+ print ("p-value = " + str(p))
+ if p <= 0.05:
+ print ("Significance")
+ else:
+ print ("No significance")
+ print ("-----------------------------")
+
+#Perform t-tests
+ttest(data, group1="horsebean", group2="casein", x="feed", y="weight")
+We import pandas, numpy, scipy.stats, minimize from scipy.optimize, and all functions from plotnine (this is different than import plotnine).
+We then use pandas.read_csv() to import a text file as a pandas Dataframe.
+ +import pandas
+import numpy
+import scipy.stats
+from scipy.optimize import minimize
+from plotnine import *
+
+#Import our data set
+data = pandas.read_csv("chickwts.txt")
+We can check whether the data have been loaded. The dataframe contains 70 rows of feed types and the chick weights produced from that type of feed.
+ +print(data)
+We can visualize the data using the following line of code, which creates a bar graph of mean chick weights by feed type.
+ +#Graph a summary plot of means for each feed type
+ggplot(data=data) + geom_bar(aes(x='factor(feed)', y='weight'), stat = "summary", fun_y = numpy.mean) + theme_classic() + xlab("feed type") + ylab("mean weight")
+Here, we'll be coding two functions to calculate the negative loglikelihood for a given observation for each of our models. +Remember, the equation for the null model is: +$$y = \beta_0 + ε$$
+The equation for the alternative model is:
+$$y = \beta_0 + \beta_1x + ε$$We'll start with the null model.
+First define the function
+ +def nllike_null (p, obs):
+Now we will unpack the parameters
+Here, p is a list of two terms: the intercept (β0) and the standard deviation of the error (sigma)
+ +def nllike_null (p, obs):
+ B0 = p[0]
+ sigma = p[1]
+We calculate the expected value (y).
+Where's the error in the formula? Stick around to find out!
+ +def nllike_null (p, obs):
+ B0 = p[0]
+ sigma = p[1]
+ expected = B0
+Then we calculate the negative loglikelihood (nll)
+The error sigma pops back in this equation. We then return nll.
+ +def nllike_null (p, obs):
+ B0 = p[0]
+ sigma = p[1]
+ expected = B0
+ nll = -1 * scipy.stats.norm(expected, sigma).logpdf(obs.y).sum()
+ return nll
+Our completed function for the negative loglikelihood for the null model:
+ +#A function to return the negative log-likelihood for the null model
+def nllike_null (p, obs):
+ B0 = p[0]
+ sigma = p[1]
+ expected = B0
+ nll = -1 * scipy.stats.norm(expected, sigma).logpdf(obs.y).sum()
+ return nll
+We then create a similar function for the alternative model:
+Notice that we have more parameters to unpack as well as another term in the expected variable. Why B1 * obs.x?
+ +#A function to return the negative log-likelihood for a given observation
+def nllike (p, obs):
+ B0 = p[0]
+ B1 = p[1]
+ sigma = p[2]
+ expected = B0 + B1 * obs.x
+ nll = -1 * scipy.stats.norm(expected, sigma).logpdf(obs.y).sum()
+ return nll
+Finally, we get to the actual test. We will create a function with five parameters.
+ +data will be our dataset
+group1 will be the first feed type compared
+group2 will be the second feed type compared
+x is the independent categorical variable, i.e. "feed"
+y is the response variable, i.e. "weight"
+
+#function for returning p-value for t-test
+def ttest (data, group1, group2, x, y):
+We'll subset the data to only the two feed types we're interested in and name the result temp_df
+ +#function for returning p-value for t-test
+def ttest (data, group1, group2, x, y):
+ #define a temporary slice of the data
+ temp_df = data[(data[x]==group1) | (data[x]==group2)]
+We'll create a new column in temp_df called x, with values of 0 or 1 depending on the feed type of the row.
+We then make another new column called y that's just a copy of the response column
+ +#function for returning p-value for t-test
+def ttest (data, group1, group2, x, y):
+ #define a temporary slice of the data
+ temp_df = data[(data[x]==group1) | (data[x]==group2)]
+
+ #Make new column 'x' set as 0 or 1 based on group
+ temp_df["x"] = 0
+ temp_df["x"][temp_df[x] == group2] = 1
+ temp_df["y"] = temp_df[y]
+We use minimize in the scipy package to calculate the fit for the two models.
+ +#function for returning p-value for t-test
+def ttest (data, group1, group2, x, y):
+ #define a temporary slice of the data
+ temp_df = data[(data[x]==group1) | (data[x]==group2)]
+
+ #Make new column 'x' set as 0 or 1 based on group
+ temp_df["x"] = 0
+ temp_df["x"][temp_df[x] == group2] = 1
+ temp_df["y"] = temp_df[y]
+
+ # y = B0 + B1*x + E
+ model = minimize(nllike, [1, 1, 1], method = "Nelder-Mead", options={'disp': True}, args = temp_df)
+
+ # y = B0 + E
+ null_model = minimize(nllike_null, [1, 1], method = "Nelder-Mead", options={'disp': True}, args = temp_df)
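+To compute the p-value of the t-test, we take twice the difference between the negative log-likelihoods of the two models and compare it to a chi-squared distribution with one degree of freedom.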
+ +#function for returning p-value for t-test
+def ttest (data, group1, group2, x, y):
+ #define a temporary slice of the data
+ temp_df = data[(data[x]==group1) | (data[x]==group2)]
+
+ #Make new column 'x' set as 0 or 1 based on group
+ temp_df["x"] = 0
+ temp_df["x"][temp_df[x] == group2] = 1
+ temp_df["y"] = temp_df[y]
+
+ # y = B0 + B1*x + E
+ model = minimize(nllike, [1, 1, 1], method = "Nelder-Mead", options={'disp': True}, args = temp_df)
+
+ # y = B0 + E
+ null_model = minimize(nllike_null, [1, 1], method = "Nelder-Mead", options={'disp': True}, args = temp_df)
+
+ #Get differences in fit
+ D = (null_model.fun - model.fun) * 2
+
+ #Use chi3.sf() for returning p-value
+ p = scipy.stats.chi2.sf(D,1)
+Finally, we display the output in a nicely formatted way.
+ +#function for returning p-value for t-test
+def ttest (data, group1, group2, x, y):
+ #define a temporary slice of the data
+ temp_df = data[(data[x]==group1) | (data[x]==group2)]
+
+ #Make new column 'x' set as 0 or 1 based on group
+ temp_df["x"] = 0
+ temp_df["x"][temp_df[x] == group2] = 1
+ temp_df["y"] = temp_df[y]
+
+ # y = B0 + B1*x + E
+ model = minimize(nllike, [1, 1, 1], method = "Nelder-Mead", options={'disp': True}, args = temp_df)
+
+ # y = B0 + E
+ null_model = minimize(nllike_null, [1, 1], method = "Nelder-Mead", options={'disp': True}, args = temp_df)
+
+ #Get differences in fit
+ D = (null_model.fun - model.fun) * 2
+
+ #Use chi3.sf() for returning p-value
+ p = scipy.stats.chi2.sf(D,1)
+
+ #Print results
+ print ("-----------------------------")
+ print (group1 + " vs. " + group2)
+ print ("p-value = " + str(p))
+ if p <= 0.05:
+ print ("Significance")
+ else:
+ print ("No significance")
+ print ("-----------------------------")
+#Perform t-tests
+ttest(data, group1="horsebean", group2="casein", x="feed", y="weight")
+import pandas
+import numpy
+import scipy.stats
+from scipy.optimize import minimize
+from plotnine import *
+
+data = pandas.read_csv("chickwts.txt")
+
+ggplot(data=data) + geom_bar(aes(x='factor(feed)', y='weight'), stat = "summary", fun_y = numpy.mean) + theme_classic() + xlab("feed type") + ylab("mean weight")
+
+#function for returning negative log likelihood for t-test model
+def nllike (p, obs):
+ B0 = p[0]
+ B1 = p[1]
+ sigma = p[2]
+ expected = B0 + B1 * obs.x
+ nll = -1 * scipy.stats.norm(expected, sigma).logpdf(obs.y).sum()
+ return nll
+
+#function for returning negative log likelihood for null model
+def nllike_null (p, obs):
+ B0 = p[0]
+ sigma = p[1]
+ expected = B0
+ nll = -1 * scipy.stats.norm(expected, sigma).logpdf(obs.y).sum()
+ return nll
+
+#function for returning p-value for t-test
+def ttest (data, group1, group2, x, y):
+ #define a temporary slice of the data
+ temp_df = data[(data[x]==group1) | (data[x]==group2)]
+
+ #Make new column 'x' set as 0 or 1 based on group
+ temp_df["x"] = 0
+ temp_df["x"][temp_df[x] == group2] = 1
+ temp_df["y"] = temp_df[y]
+
+ # y = B0 + B1*x + E
+ model = minimize(nllike, [1, 1, 1], method = "Nelder-Mead", options={'disp': True}, args = temp_df)
+
+ # y = B0 + E
+ null_model = minimize(nllike_null, [1, 1], method = "Nelder-Mead", options={'disp': True}, args = temp_df)
+
+ #Get differences in fit
+ D = (null_model.fun - model.fun) * 2
+
+ #Use chi3.sf() for returning p-value
+ p = scipy.stats.chi2.sf(D,1)
+
+ #Print results
+ print ("-----------------------------")
+ print (group1 + " vs. " + group2)
+ print ("p-value = " + str(p))
+ if p <= 0.05:
+ print ("Significance")
+ else:
+ print ("No significance")
+ print ("-----------------------------")
+
+#Perform t-tests
+ttest(data, group1="horsebean", group2="casein", x="feed", y="weight")
+