Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
164 changes: 164 additions & 0 deletions MichaelHiggins/MachineLearning.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.svm import LinearSVC\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.model_selection import GridSearchCV\n",
"from sklearn.datasets import load_files\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn import metrics"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"n_samples: 2000\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/astro/classes/Astro_300/miniconda/lib/python3.6/site-packages/sklearn/model_selection/_split.py:1978: FutureWarning: The default value of cv will change from 3 to 5 in version 0.22. Specify it explicitly to silence this warning.\n",
" warnings.warn(CV_WARNING, FutureWarning)\n",
"/astro/classes/Astro_300/miniconda/lib/python3.6/site-packages/sklearn/svm/base.py:929: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
" \"the number of iterations.\", ConvergenceWarning)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 params - {'vect__ngram_range': (1, 1)}; mean - 0.82; std - 0.01\n",
"1 params - {'vect__ngram_range': (1, 2)}; mean - 0.83; std - 0.00\n",
" precision recall f1-score support\n",
"\n",
" neg 0.89 0.87 0.88 243\n",
" pos 0.88 0.90 0.89 257\n",
"\n",
" accuracy 0.89 500\n",
" macro avg 0.89 0.89 0.89 500\n",
"weighted avg 0.89 0.89 0.89 500\n",
"\n",
"[[211 32]\n",
" [ 25 232]]\n",
"1500\n"
]
}
],
"source": [
"if __name__ == \"__main__\":\n",
" # NOTE: we put the following in a 'if __name__ == \"__main__\"' protected\n",
" # block to be able to use a multi-core grid search that also works under\n",
" # Windows, see: http://docs.python.org/library/multiprocessing.html#windows\n",
" # The multiprocessing module is used as the backend of joblib.Parallel\n",
" # that is used when n_jobs != 1 in GridSearchCV\n",
"\n",
" # the training data folder must be passed as first argument\n",
" movie_reviews_data_folder = sys.argv[1]\n",
" dataset = load_files('scikit-learn/doc/tutorial/text_analytics/data/movie_reviews/txt_sentoken', shuffle=False)\n",
" print(\"n_samples: %d\" % len(dataset.data))\n",
"\n",
" # split the dataset in training and test set:\n",
" docs_train, docs_test, y_train, y_test = train_test_split(\n",
" dataset.data, dataset.target, test_size=0.25, random_state=None)\n",
"\n",
" # TASK: Build a vectorizer / classifier pipeline that filters out tokens\n",
" # that are too rare or too frequent\n",
" pipeline = Pipeline([\n",
" ('vect', TfidfVectorizer(min_df=3, max_df=0.95)),\n",
" ('clf', LinearSVC(C=1000)),\n",
" ])\n",
"\n",
" # TASK: Build a grid search to find out whether unigrams or bigrams are\n",
" # more useful.\n",
" # Fit the pipeline on the training set using grid search for the parameters\n",
" parameters = {\n",
" 'vect__ngram_range': [(1, 1), (1, 2)],\n",
" }\n",
" grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1)\n",
" grid_search.fit(docs_train, y_train)\n",
"\n",
" # TASK: print the mean and std for each candidate along with the parameter\n",
" # settings for all the candidates explored by grid search.\n",
" n_candidates = len(grid_search.cv_results_['params'])\n",
" for i in range(n_candidates):\n",
" print(i, 'params - %s; mean - %0.2f; std - %0.2f'\n",
" % (grid_search.cv_results_['params'][i],\n",
" grid_search.cv_results_['mean_test_score'][i],\n",
" grid_search.cv_results_['std_test_score'][i]))\n",
"\n",
" # TASK: Predict the outcome on the testing set and store it in a variable\n",
" # named y_predicted\n",
" y_predicted = grid_search.predict(docs_test)\n",
"\n",
" # Print the classification report\n",
" print(metrics.classification_report(y_test, y_predicted,\n",
" target_names=dataset.target_names))\n",
"\n",
" # Print and plot the confusion matrix\n",
" cm = metrics.confusion_matrix(y_test, y_predicted)\n",
" print(cm)\n",
"\n",
" print (len(y_train))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(2000, 39659)"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"count_vect = CountVectorizer()\n",
"y_train_counts = count_vect.fit_transform(dataset.data)\n",
"y_train_counts.shape"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
43 changes: 43 additions & 0 deletions MichaelHiggins/scikit-learn/CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@

Contributing to scikit-learn
============================

There are many ways to contribute to scikit-learn, with the most common ones
being contribution of code or documentation to the project. Improving the
documentation is no less important than improving the library itself. If you
find a typo in the documentation, or have made improvements, do not hesitate to
send an email to the mailing list or preferably submit a GitHub pull request.
Documentation can be found under the
[doc/](https://github.com/scikit-learn/scikit-learn/tree/master/doc) directory.

But there are many other ways to help. In particular answering queries on the
[issue tracker](https://github.com/scikit-learn/scikit-learn/issues),
investigating bugs, and [reviewing other developers' pull
requests](http://scikit-learn.org/dev/developers/contributing.html#code-review-guidelines)
are very valuable contributions that decrease the burden on the project
maintainers.

Another way to contribute is to report issues you're facing, and give a "thumbs
up" on issues that others reported and that are relevant to you. It also helps
us if you spread the word: reference the project from your blog and articles,
link to it from your website, or simply star it in GitHub to say "I use it".

Guidelines
----------

Full contribution guidelines are available in the repository at
`doc/developers/contributing.rst`, or online at:

http://scikit-learn.org/dev/developers/contributing.html

Quick links to:
* [Submitting a bug report or feature request](http://scikit-learn.org/dev/developers/contributing.html#submitting-a-bug-report-or-a-feature-request)
* [Contributing code](http://scikit-learn.org/dev/developers/contributing.html#contributing-code)
* [Coding guidelines](http://scikit-learn.org/dev/developers/contributing.html#coding-guidelines)
* [Tips to read current code](http://scikit-learn.org/dev/developers/contributing.html#reading-code)

Code of Conduct
---------------

We abide by the principles of openness, respect, and consideration of others
of the Python Software Foundation: https://www.python.org/psf/codeofconduct/.
32 changes: 32 additions & 0 deletions MichaelHiggins/scikit-learn/COPYING
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
New BSD License

Copyright (c) 2007–2019 The scikit-learn developers.
All rights reserved.


Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

a. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
b. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
c. Neither the name of the Scikit-learn Developers nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.


THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
DAMAGE.

57 changes: 57 additions & 0 deletions MichaelHiggins/scikit-learn/ISSUE_TEMPLATE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
<!--
If your issue is a usage question, submit it here instead:
- StackOverflow with the scikit-learn tag: https://stackoverflow.com/questions/tagged/scikit-learn
- Mailing List: https://mail.python.org/mailman/listinfo/scikit-learn
For more information, see User Questions: http://scikit-learn.org/stable/support.html#user-questions
-->

<!-- Instructions For Filing a Bug: http://scikit-learn.org/dev/developers/contributing.html#submitting-a-bug-report-or-a-feature-request -->

#### Description
<!-- Example: Joblib Error thrown when calling fit on LatentDirichletAllocation with evaluate_every > 0-->

#### Steps/Code to Reproduce
<!--
Example:
```python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

docs = ["Help I have a bug" for i in range(1000)]

vectorizer = CountVectorizer(input=docs, analyzer='word')
lda_features = vectorizer.fit_transform(docs)

lda_model = LatentDirichletAllocation(
n_components=10,
learning_method='online',
evaluate_every=10,
n_jobs=4,
)
model = lda_model.fit(lda_features)
```
If the code is too long, feel free to put it in a public gist and link
it in the issue: https://gist.github.com
-->

#### Expected Results
<!-- Example: No error is thrown. Please paste or describe the expected results.-->

#### Actual Results
<!-- Please paste or specifically describe the actual output or traceback. -->

#### Versions
<!--
Please run the following snippet and paste the output below.
For scikit-learn >= 0.20:
import sklearn; sklearn.show_versions()
For scikit-learn < 0.20:
import platform; print(platform.platform())
import sys; print("Python", sys.version)
import numpy; print("NumPy", numpy.__version__)
import scipy; print("SciPy", scipy.__version__)
import sklearn; print("Scikit-Learn", sklearn.__version__)
-->


<!-- Thanks for contributing! -->
8 changes: 8 additions & 0 deletions MichaelHiggins/scikit-learn/MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
include *.rst
recursive-include doc *
recursive-include examples *
recursive-include sklearn *.c *.h *.pyx *.pxd *.pxi *.tp
recursive-include sklearn/datasets *.csv *.csv.gz *.rst *.jpg *.txt *.arff.gz *.json.gz
include COPYING
include README.rst

63 changes: 63 additions & 0 deletions MichaelHiggins/scikit-learn/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# simple makefile to simplify repetitive build env management tasks under posix

# caution: testing won't work on windows, see README

# Tool names; `?=` only assigns when the variable is not already set, so each
# one can be overridden from the environment or the command line,
# e.g. `make PYTHON=python3 inplace`.
PYTHON ?= python
CYTHON ?= cython
PYTEST ?= pytest
CTAGS ?= ctags

# skip doctests on 32bit python
# BITS is the pointer width, in bits, reported by the interpreter selected via
# PYTHON (rather than whatever `python` happens to be first on PATH), so the
# check agrees with the interpreter used by the setup.py targets.
# `:=` runs the shell command once, at parse time.
BITS := $(shell $(PYTHON) -c 'import struct; print(8 * struct.calcsize("P"))')

# Default goal: clean, rebuild the extensions in place, then run the tests.
# The steps are sequential sub-makes instead of prerequisites because
# prerequisites of one target may be built concurrently under `make -j`,
# which would let `clean` race against the build it is meant to precede.
.PHONY: all
all:
	$(MAKE) clean
	$(MAKE) inplace
	$(MAKE) test

# Remove the `tags` file (presumably the index written by the `ctags` target).
.PHONY: clean-ctags
clean-ctags:
	rm -f tags

# Run setup.py's own clean command, then delete the dist/ directory.
.PHONY: clean
clean: clean-ctags
	$(PYTHON) setup.py clean
	rm -rf dist

# Build the extension modules inside the source tree (`build_ext -i`).
# Declared phony so the recipe runs on every request, even if a file named
# `in` or `inplace` ever appears in the directory.
.PHONY: in inplace
in: inplace # just a shortcut
inplace:
	$(PYTHON) setup.py build_ext -i

# Run the sklearn test suite after an in-place build.
.PHONY: test-code
test-code: in
	$(PYTEST) --showlocals -v sklearn --durations=20

# Run the tests under doc/sphinxext/.
.PHONY: test-sphinxext
test-sphinxext:
	$(PYTEST) --showlocals -v doc/sphinxext/

# Run pytest over every .rst file under doc/. The recipe is only present when
# BITS is 64; otherwise the target has an empty recipe and does nothing.
.PHONY: test-doc
test-doc:
ifeq ($(BITS),64)
	$(PYTEST) $(shell find doc -name '*.rst' | sort)
endif

# Run the sklearn tests with coverage measurement and write an HTML report to
# coverage/. Previous results (coverage/ and .coverage) are removed first.
.PHONY: test-coverage
test-coverage:
	rm -rf coverage .coverage
	$(PYTEST) sklearn --showlocals -v --cov=sklearn --cov-report=html:coverage

# Aggregate target: code tests, sphinx-extension tests and doctests.
.PHONY: test
test: test-code test-sphinxext test-doc

# Strip trailing spaces and tabs from every .py file under sklearn/, editing
# the files in place. `$$` is how a literal `$` (the regex end-of-line anchor)
# is passed through make to the shell.
.PHONY: trailing-spaces
trailing-spaces:
	find sklearn -name "*.py" -exec perl -pi -e 's/[ \t]*$$//' {} \;

# Run setup.py's build_src command with the interpreter selected by PYTHON,
# consistent with the `clean` and `inplace` targets.
.PHONY: cython
cython:
	$(PYTHON) setup.py build_src

# make tags for symbol based navigation in emacs and vim
# Install with: sudo apt-get install exuberant-ctags
# (These notes are make-level comments, so they are not handed to the shell
# and echoed each time the target runs.)
.PHONY: ctags
ctags:
	$(CTAGS) --python-kinds=-i -R sklearn

# Build the HTML documentation through the makefile in doc/.
# `doc` is also the name of a real directory, so it must be declared phony:
# otherwise make treats the directory as the target file and decides whether
# to run the recipe from its timestamp.
.PHONY: doc
doc: inplace
	$(MAKE) -C doc html

# Same as `doc`, but invokes the `html-noplot` goal of doc/'s makefile.
.PHONY: doc-noplot
doc-noplot: inplace
	$(MAKE) -C doc html-noplot

# Static checks: flake8 over sklearn (report lines mentioning `__init__` or
# `external` are filtered out), then pylint in errors-only mode with E1103,
# E0611 and E1101 disabled.
# NOTE(review): the first recipe line's exit status is that of the last
# `grep -v`, which is non-zero when nothing is left to print, so a clean
# flake8 run appears to stop the target before pylint runs. Confirm intent.
.PHONY: code-analysis
code-analysis:
	flake8 sklearn | grep -v __init__ | grep -v external
	pylint -E -i y sklearn/ -d E1103,E0611,E1101

# Run the flake8_diff.sh helper script from build_tools/circle/.
.PHONY: flake8-diff
flake8-diff:
	./build_tools/circle/flake8_diff.sh
Loading