Astr480 · Higgins00 · Jun 3, 2019 · Jun 5, 2019
diff --git a/MichaelHiggins/MachineLearning.ipynb b/MichaelHiggins/MachineLearning.ipynb
@@ -0,0 +1,164 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.svm import LinearSVC\n",
+    "from sklearn.pipeline import Pipeline\n",
+    "from sklearn.model_selection import GridSearchCV\n",
+    "from sklearn.datasets import load_files\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn import metrics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "n_samples: 2000\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/astro/classes/Astro_300/miniconda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.\n",
+      "  warnings.warn(CV_WARNING, FutureWarning)\n",
+      "/astro/classes/Astro_300/miniconda/lib/python3.6/site-packages/sklearn/svm/base.py:929: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
+      "  \"the number of iterations.\", ConvergenceWarning)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0 params - {'vect__ngram_range': (1, 1)}; mean - 0.82; std - 0.01\n",
+      "1 params - {'vect__ngram_range': (1, 2)}; mean - 0.83; std - 0.00\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "         neg       0.89      0.87      0.88       243\n",
+      "         pos       0.88      0.90      0.89       257\n",
+      "\n",
+      "    accuracy                           0.89       500\n",
+      "   macro avg       0.89      0.89      0.89       500\n",
+      "weighted avg       0.89      0.89      0.89       500\n",
+      "\n",
+      "[[211  32]\n",
+      " [ 25 232]]\n",
+      "1500\n"
+     ]
+    }
+   ],
+   "source": [
+    "if __name__ == \"__main__\":\n",
+    "    # NOTE: we put the following in a 'if __name__ == \"__main__\"' protected\n",
+    "    # block to be able to use a multi-core grid search that also works under\n",
+    "    # Windows, see: http://docs.python.org/library/multiprocessing.html#windows\n",
+    "    # The multiprocessing module is used as the backend of joblib.Parallel\n",
+    "    # that is used when n_jobs != 1 in GridSearchCV\n",
+    "\n",
+    "    # the training data folder is given relative to the notebook's working directory\n",
+    "    movie_reviews_data_folder = 'scikit-learn/doc/tutorial/text_analytics/data/movie_reviews/txt_sentoken'\n",
+    "    dataset = load_files(movie_reviews_data_folder, shuffle=False)\n",
+    "    print(\"n_samples: %d\" % len(dataset.data))\n",
+    "\n",
+    "    # split the dataset in training and test set:\n",
+    "    docs_train, docs_test, y_train, y_test = train_test_split(\n",
+    "        dataset.data, dataset.target, test_size=0.25, random_state=None)\n",
+    "\n",
+    "    # TASK: Build a vectorizer / classifier pipeline that filters out tokens\n",
+    "    # that are too rare or too frequent\n",
+    "    pipeline = Pipeline([\n",
+    "        ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),\n",
+    "        ('clf', LinearSVC(C=1000)),\n",
+    "    ])\n",
+    "\n",
+    "    # TASK: Build a grid search to find out whether unigrams or bigrams are\n",
+    "    # more useful.\n",
+    "    # Fit the pipeline on the training set using grid search for the parameters\n",
+    "    parameters = {\n",
+    "        'vect__ngram_range': [(1, 1), (1, 2)],\n",
+    "    }\n",
+    "    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)\n",
+    "    grid_search.fit(docs_train, y_train)\n",
+    "\n",
+    "    # TASK: print the mean and std for each candidate along with the parameter\n",
+    "    # settings for all the candidates explored by grid search.\n",
+    "    n_candidates = len(grid_search.cv_results_['params'])\n",
+    "    for i in range(n_candidates):\n",
+    "        print(i, 'params - %s; mean - %0.2f; std - %0.2f'\n",
+    "                 % (grid_search.cv_results_['params'][i],\n",
+    "                    grid_search.cv_results_['mean_test_score'][i],\n",
+    "                    grid_search.cv_results_['std_test_score'][i]))\n",
+    "\n",
+    "    # TASK: Predict the outcome on the testing set and store it in a variable\n",
+    "    # named y_predicted\n",
+    "    y_predicted = grid_search.predict(docs_test)\n",
+    "\n",
+    "    # Print the classification report\n",
+    "    print(metrics.classification_report(y_test, y_predicted,\n",
+    "                                        target_names=dataset.target_names))\n",
+    "\n",
+    "    # Print and plot the confusion matrix\n",
+    "    cm = metrics.confusion_matrix(y_test, y_predicted)\n",
+    "    print(cm)\n",
+    "\n",
+    "    print(len(y_train))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(2000, 39659)"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from sklearn.feature_extraction.text import CountVectorizer\n",
+    "# Token-count matrix for every document in the dataset (not only the training split)\n",
+    "count_vect = CountVectorizer()\n",
+    "X_counts = count_vect.fit_transform(dataset.data)\n",
+    "X_counts.shape"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/MichaelHiggins/scikit-learn/CONTRIBUTING.md b/MichaelHiggins/scikit-learn/CONTRIBUTING.md
@@ -0,0 +1,43 @@
+
+Contributing to scikit-learn
+============================
+
+There are many ways to contribute to scikit-learn, with the most common ones
+being contribution of code or documentation to the project. Improving the
+documentation is no less important than improving the library itself. If you
+find a typo in the documentation, or have made improvements, do not hesitate to
+send an email to the mailing list or preferably submit a GitHub pull request.
+Documentation can be found under the
+[doc/](https://github.com/scikit-learn/scikit-learn/tree/master/doc) directory.
+
+But there are many other ways to help. In particular, answering queries on the
+[issue tracker](https://github.com/scikit-learn/scikit-learn/issues),
+investigating bugs, and [reviewing other developers' pull
+requests](http://scikit-learn.org/dev/developers/contributing.html#code-review-guidelines)
+are very valuable contributions that decrease the burden on the project
+maintainers.
+
+Another way to contribute is to report issues you're facing, and give a "thumbs
+up" on issues that others reported and that are relevant to you. It also helps
+us if you spread the word: reference the project from your blog and articles,
+link to it from your website, or simply star it in GitHub to say "I use it".
+
+Guidelines
+----------
+
+Full contribution guidelines are available in the repository at
+`doc/developers/contributing.rst`, or online at:
+
+http://scikit-learn.org/dev/developers/contributing.html
+
+Quick links to:
+* [Submitting a bug report or feature request](http://scikit-learn.org/dev/developers/contributing.html#submitting-a-bug-report-or-a-feature-request)
+* [Contributing code](http://scikit-learn.org/dev/developers/contributing.html#contributing-code)
+* [Coding guidelines](http://scikit-learn.org/dev/developers/contributing.html#coding-guidelines)
+* [Tips to read current code](http://scikit-learn.org/dev/developers/contributing.html#reading-code)
+
+Code of Conduct
+---------------
+
+We abide by the principles of openness, respect, and consideration of others
+of the Python Software Foundation: https://www.python.org/psf/codeofconduct/.
diff --git a/MichaelHiggins/scikit-learn/COPYING b/MichaelHiggins/scikit-learn/COPYING
@@ -0,0 +1,32 @@
+New BSD License
+
+Copyright (c) 2007–2019 The scikit-learn developers.
+All rights reserved.
+
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+  a. Redistributions of source code must retain the above copyright notice,
+     this list of conditions and the following disclaimer.
+  b. Redistributions in binary form must reproduce the above copyright
+     notice, this list of conditions and the following disclaimer in the
+     documentation and/or other materials provided with the distribution.
+  c. Neither the name of the Scikit-learn Developers  nor the names of
+     its contributors may be used to endorse or promote products
+     derived from this software without specific prior written
+     permission. 
+
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGE.
+
diff --git a/MichaelHiggins/scikit-learn/ISSUE_TEMPLATE.md b/MichaelHiggins/scikit-learn/ISSUE_TEMPLATE.md
@@ -0,0 +1,57 @@
+<!--
+If your issue is a usage question, submit it here instead:
+- StackOverflow with the scikit-learn tag: https://stackoverflow.com/questions/tagged/scikit-learn
+- Mailing List: https://mail.python.org/mailman/listinfo/scikit-learn
+For more information, see User Questions: http://scikit-learn.org/stable/support.html#user-questions
+-->
+
+<!-- Instructions For Filing a Bug: https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md#filing-bugs -->
+
+#### Description
+<!-- Example: Joblib Error thrown when calling fit on LatentDirichletAllocation with evaluate_every > 0-->
+
+#### Steps/Code to Reproduce
+<!--
+Example:
+```python
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.decomposition import LatentDirichletAllocation
+
+docs = ["Help I have a bug" for i in range(1000)]
+
+vectorizer = CountVectorizer(input=docs, analyzer='word')
+lda_features = vectorizer.fit_transform(docs)
+
+lda_model = LatentDirichletAllocation(
+    n_topics=10,
+    learning_method='online',
+    evaluate_every=10,
+    n_jobs=4,
+)
+model = lda_model.fit(lda_features)
+```
+If the code is too long, feel free to put it in a public gist and link
+it in the issue: https://gist.github.com
+-->
+
+#### Expected Results
+<!-- Example: No error is thrown. Please paste or describe the expected results.-->
+
+#### Actual Results
+<!-- Please paste or specifically describe the actual output or traceback. -->
+
+#### Versions
+<!--
+Please run the following snippet and paste the output below.
+For scikit-learn >= 0.20:
+import sklearn; sklearn.show_versions()
+For scikit-learn < 0.20:
+import platform; print(platform.platform())
+import sys; print("Python", sys.version)
+import numpy; print("NumPy", numpy.__version__)
+import scipy; print("SciPy", scipy.__version__)
+import sklearn; print("Scikit-Learn", sklearn.__version__)
+-->
+
+
+<!-- Thanks for contributing! -->
diff --git a/MichaelHiggins/scikit-learn/MANIFEST.in b/MichaelHiggins/scikit-learn/MANIFEST.in
@@ -0,0 +1,8 @@
+include *.rst
+recursive-include doc *
+recursive-include examples *
+recursive-include sklearn *.c *.h *.pyx *.pxd *.pxi *.tp
+recursive-include sklearn/datasets *.csv *.csv.gz *.rst *.jpg *.txt *.arff.gz *.json.gz
+include COPYING
+include README.rst
+
diff --git a/MichaelHiggins/scikit-learn/Makefile b/MichaelHiggins/scikit-learn/Makefile
@@ -0,0 +1,63 @@
+# simple makefile to simplify repetitive build env management tasks under posix
+
+# caution: testing won't work on windows, see README
+
+# Tool commands used by the recipes below. `?=` assigns only when the
+# variable is not already set, so each one can be overridden from the
+# environment or the command line (e.g. `make PYTHON=python3 test`).
+# CYTHON is defined here but not referenced by any rule in this Makefile.
+PYTHON ?= python
+CYTHON ?= cython
+PYTEST ?= pytest
+CTAGS ?= ctags
+
+# skip doctests on 32bit python
+# BITS is the pointer width (in bits) reported by the interpreter, computed
+# once at parse time; test-doc only runs its recipe when this equals 64.
+# Query $(PYTHON) rather than a hard-coded `python` so that an overridden
+# interpreter is the one that gets inspected.
+BITS := $(shell $(PYTHON) -c 'import struct; print(8 * struct.calcsize("P"))')
+
+# Every target in this Makefile names a command, not a file it produces, so
+# declare them all phony: an existing file or directory with the same name
+# (e.g. the `doc` directory) can then never make a target look up to date.
+.PHONY: all clean-ctags clean in inplace test-code test-sphinxext test-doc \
+        test-coverage test trailing-spaces cython ctags doc doc-noplot \
+        code-analysis flake8-diff
+
+# Default goal: clean, rebuild the extensions in place, then run the tests.
+all: clean inplace test
+
+# Delete the `tags` file; -f keeps this quiet when the file does not exist.
+clean-ctags:
+	rm -f tags
+
+# Run clean-ctags first, then setup.py's own `clean` command, and finally
+# remove the dist directory.
+clean: clean-ctags
+	$(PYTHON) setup.py clean
+	rm -rf dist
+
+# Build the extension modules inside the source tree (`build_ext -i`).
+# `in` has no recipe of its own; it only pulls in `inplace`.
+in: inplace # just a shortcut
+inplace:
+	$(PYTHON) setup.py build_ext -i
+
+# Run pytest on the sklearn package after the in-place build (prerequisite
+# `in`). --showlocals prints local variables in tracebacks and
+# --durations=20 reports the 20 slowest tests.
+test-code: in
+	$(PYTEST) --showlocals -v sklearn --durations=20
+# Run pytest on doc/sphinxext/. Unlike test-code, this has no prerequisite
+# on the in-place build.
+test-sphinxext:
+	$(PYTEST) --showlocals -v doc/sphinxext/
+# Run pytest over every .rst file found under doc/, in sorted order.
+# The recipe line is only included when BITS is 64; otherwise this target
+# has an empty recipe and does nothing.
+test-doc:
+ifeq ($(BITS),64)
+	$(PYTEST) $(shell find doc -name '*.rst' | sort)
+endif
+
+# Remove previous coverage output (the `coverage` directory and the
+# `.coverage` data file), then run pytest on sklearn with coverage enabled
+# and an HTML report written to the `coverage` directory.
+# The --cov options presumably come from the pytest-cov plugin; verify it is
+# installed. This target does not depend on the in-place build.
+test-coverage:
+	rm -rf coverage .coverage
+	$(PYTEST) sklearn --showlocals -v --cov=sklearn --cov-report=html:coverage
+
+# Aggregate target: code tests, sphinx extension tests and doc tests.
+test: test-code test-sphinxext test-doc
+
+# Strip trailing spaces and tabs from every *.py file under sklearn, editing
+# the files in place (perl -pi). `$$` is Make's escape for a literal `$`, so
+# the regex perl receives is s/[ \t]*$//.
+trailing-spaces:
+	find sklearn -name "*.py" -exec perl -pi -e 's/[ \t]*$$//' {} \;
+
+# Run setup.py's build_src command. Uses $(PYTHON), like the other setup.py
+# invocations in this file, so an overridden interpreter is honoured.
+cython:
+	$(PYTHON) setup.py build_src
+
+# Make tags for symbol based navigation in emacs and vim.
+# Install with: sudo apt-get install exuberant-ctags
+# These notes are Make comments on purpose: a tab-indented `#` line inside a
+# recipe is handed to the shell and echoed on every run.
+ctags:
+	$(CTAGS) --python-kinds=-i -R sklearn
+
+# Build the documentation by running the `html` goal of the Makefile in doc/,
+# after the in-place build. $(MAKE) is used for the sub-make so that flags
+# such as -n and -j are passed down.
+doc: inplace
+	$(MAKE) -C doc html
+
+# Same as `doc`, but runs the `html-noplot` goal of doc/Makefile instead.
+doc-noplot: inplace
+	$(MAKE) -C doc html-noplot
+
+# Static analysis: flake8 over sklearn with lines mentioning `__init__` or
+# `external` filtered out, then pylint in errors-only mode (-E) with checks
+# E1103, E0611 and E1101 disabled.
+# NOTE(review): the first line's exit status is that of the last grep, which
+# is non-zero when no lines survive the filters, so a flake8-clean tree makes
+# this step fail and pylint never runs. Confirm whether that is intended.
+# NOTE(review): `-i y` may not be accepted by recent pylint releases; verify.
+code-analysis:
+	flake8 sklearn | grep -v __init__ | grep -v external
+	pylint -E -i y sklearn/ -d E1103,E0611,E1101
+
+# Run the helper script build_tools/circle/flake8_diff.sh from the repository
+# root. Presumably it lints only the changed lines; see the script itself.
+flake8-diff:
+	./build_tools/circle/flake8_diff.sh