From d850ce2b1680b0d48dff48cd561601efbc9ec711 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Wed, 8 Jul 2020 22:56:32 +0200 Subject: [PATCH 01/15] Add Automated Readability Index (ARI). Closes #20 Add ARI to visualization module. Add unit tests to test_visualization. Additionally import numpy in visualization and test_visualization to be able to return NaNs in Series. --- tests/test_visualization.py | 14 +++++++++ texthero/visualization.py | 61 +++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/tests/test_visualization.py b/tests/test_visualization.py index 4ffa4566..71866469 100644 --- a/tests/test_visualization.py +++ b/tests/test_visualization.py @@ -1,6 +1,7 @@ import string import pandas as pd +import numpy as np import doctest from texthero import visualization @@ -59,3 +60,16 @@ def test_top_words_digits_punctuation(self): def test_wordcloud(self): s = pd.Series("one two three") self.assertEqual(visualization.wordcloud(s), None) + + """ + Test automated readability index + """ + + def test_ari(self): + s = pd.Series(["New York is a beautiful city.", "Look: New York!", "Wow"]) + s_true = pd.Series([3.0, 6.0, np.nan]) + self.assertEqual(visualization.automated_readability_index(s), s_true) + + def test_ari_numeric(self): + s = pd.Series([1.0, 2.0]) + self.assertRaises(ValueError, visualization.automated_readability_index, s) diff --git a/texthero/visualization.py b/texthero/visualization.py index c363fc17..47fb29eb 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -3,6 +3,7 @@ """ import pandas as pd +import numpy as np import plotly.express as px from wordcloud import WordCloud @@ -185,3 +186,63 @@ def top_words(s: pd.Series, normalize=False) -> pd.Series: .explode() # one word for each line .value_counts(normalize=normalize) ) + + +def automated_readability_index(s: pd.Series) -> pd.Series: + """ + Calculate the automated readability index (ARI). 
+ + Calculate ARI for each item in the given Pandas Series. Return a Pandas Series with the ARI scores. + Score is NaN if it cannot be computed (e.g. if the number of sentences is 0). + + Examples + -------- + >>> import texthero as hero + >>> import pandas as pd + >>> s = pd.Series(["New York is a beautiful city.", "Look: New York!", "Wow"]) + >>> hero.automated_readability_index(s) + 0 3.0 + 1 6.0 + 2 NaN + dtype: float64 + + Reference + -------- + `Automated Readability Index `_ + + """ + try: + s = preprocessing.remove_whitespace( + s + ) # Whitespace is used to calculate number of words. + except: + raise ValueError("Input series has non-string items.") + + def _ari(text: str): + # Computes the ARI for one string. + # Straightforward implementation of the Wikipedia description. + if not isinstance(text, str): + return np.nan + + characters = sentences = words = 0 + + for char in text: + if char.isalnum(): + characters += 1 + elif char == " ": + words += 1 + elif char in [".", "!", "?"]: + sentences += 1 + else: + continue + + # Avoid 0-division. 
+ if words > 0 and sentences > 0: + score = 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43 + score = np.ceil(score) + else: + score = np.nan + + return score + + return s.apply(_ari) From cecfbf8158098bd231b6a085821179216e171eb7 Mon Sep 17 00:00:00 2001 From: Jonathan Besomi <43236409+jbesomi@users.noreply.github.com> Date: Wed, 8 Jul 2020 21:46:45 +0200 Subject: [PATCH 02/15] Add MIT license --- LICENSE | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/LICENSE b/LICENSE index 18bce6cd..c0bda1a5 100644 --- a/LICENSE +++ b/LICENSE @@ -1,7 +1,6 @@ The MIT License (MIT) -Texthero is licensed under the following MIT license: The MIT License (MIT) -Copyright © 2020 Texthero, https://texthero.org +Copyright (c) © 2020 Texthero, https://texthero.org Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From ad25ddc4cc8420d1599ac368c3686dd148bb441c Mon Sep 17 00:00:00 2001 From: Jonathan Besomi <43236409+jbesomi@users.noreply.github.com> Date: Wed, 8 Jul 2020 22:12:34 +0200 Subject: [PATCH 03/15] Add MIT license without url --- LICENSE | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE b/LICENSE index c0bda1a5..768ae902 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (c) © 2020 Texthero, https://texthero.org +Copyright (c) 2020 Texthero Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From 5751f22645d9c2466a9a9c371b42bb87ef05649e Mon Sep 17 00:00:00 2001 From: Jonathan Besomi <43236409+jbesomi@users.noreply.github.com> Date: Thu, 9 Jul 2020 09:08:02 +0200 Subject: [PATCH 04/15] Update README.md --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index ce521fa9..0166e929 100644 --- a/README.md +++ b/README.md @@ -318,8 +318,7 
@@ If you have just other questions or inquiry drop me a line at jonathanbesomi__AT The MIT License (MIT) -Texthero is licensed under the following MIT license: The MIT License (MIT) -Copyright © 2020 Texthero, https://texthero.org +Copyright (c) 2020 Texthero Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From 6186788d11239ad56162774782b938170582fdf3 Mon Sep 17 00:00:00 2001 From: Jonathan Besomi <43236409+jbesomi@users.noreply.github.com> Date: Thu, 9 Jul 2020 09:08:40 +0200 Subject: [PATCH 05/15] Website: fix github stars button --- website/pages/en/index.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/pages/en/index.js b/website/pages/en/index.js index 9f5cf330..aab4ddb2 100644 --- a/website/pages/en/index.js +++ b/website/pages/en/index.js @@ -81,7 +81,7 @@ const GithuButton = props => ( src="https://ghbtns.com/github-btn.html?user=jbesomi&repo=texthero&type=star&count=true&size=large" frameBorder={0} scrolling={0} - width={140} + width={160} height={30} title="GitHub Stars" style={{margin: "20px"}} From eb1164ec89cc9f1c8a640b48a74a7d3031f7f54d Mon Sep 17 00:00:00 2001 From: Jonathan Besomi <43236409+jbesomi@users.noreply.github.com> Date: Thu, 9 Jul 2020 09:09:21 +0200 Subject: [PATCH 06/15] Website: add css media queries for better responsiveness on mobile --- website/static/css/custom.css | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/website/static/css/custom.css b/website/static/css/custom.css index 797d5928..a970ae74 100644 --- a/website/static/css/custom.css +++ b/website/static/css/custom.css @@ -123,8 +123,7 @@ h1, h2, h3, h4, h5, h6 { padding: 10px 0; } -@media only screen and (min-device-width: 360px) and (max-device-width: 736px) { -} + @media only screen and (max-width: 1023px) { @@ -264,6 +263,23 @@ cite { margin-right: 0px; } +/* for small screen, do not position box to 
left and right */ + +/* @media only screen and (min-device-width: 360px) and (max-device-width: 736px) { */ + +@media only screen and (max-width: 750px){ + .right { + margin-left: 0px; + margin-right: auto; + } + + .homebox { + width: 100%; + } +} + + + .home_separator { width: 60%; border-top: 2px solid #3f88c5; From 7ac1649852eae657cc28762807ace9337d317d2f Mon Sep 17 00:00:00 2001 From: ishanarora04 Date: Thu, 9 Jul 2020 17:44:40 +0530 Subject: [PATCH 07/15] Added Remove Tags and Replace Tags (#50) * Added Remove Tags and Replace Tags * removed contributor --- tests/test_preprocessing.py | 22 ++++++++++++++++ texthero/preprocessing.py | 50 +++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index ee59c9da..9372d9f6 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -259,3 +259,25 @@ def test_tokenize_with_phrases(self): self.assertEqual( preprocessing.tokenize_with_phrases(s, min_count=3, threshold=1), s_true ) + + """ + Test replace and remove tags + """ + + def test_replace_tags(self): + s = pd.Series("Hi @tag, we will replace you") + s_true = pd.Series("Hi TAG, we will replace you") + + self.assertEqual(preprocessing.replace_tags(s, symbol="TAG"), s_true) + + def test_remove_tags_alphabets(self): + s = pd.Series("Hi @tag, we will remove you") + s_true = pd.Series("Hi , we will remove you") + + self.assertEqual(preprocessing.remove_tags(s), s_true) + + def test_remove_tags_numeric(self): + s = pd.Series("Hi @123, we will remove you") + s_true = pd.Series("Hi , we will remove you") + + self.assertEqual(preprocessing.remove_tags(s), s_true) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index 14210f38..15668060 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -685,3 +685,53 @@ def remove_urls(s: pd.Series) -> pd.Series: """ return replace_urls(s, " ") + + +def replace_tags(s: pd.Series, symbol: str) -> 
pd.Series: + + r"""Replace all tags from a given Pandas Series + + `replace_tags` replace all tags in the given Pandas Series with symbol + + A tag is a string formed by @ concatenated with a sequence composed of characters and digits. Example: @texthero123 + + Parameters + ---------- + s : Pandas Series + symbol : The tag will be replaced by this symbol + + Examples + -------- + >>> import texthero as hero + >>> import pandas as pd + >>> s = pd.Series("Hi @texthero123, we will replace you") + >>> hero.replace_tags(s, symbol='TAG') + 0 Hi TAG, we will replace you + dtype: object + + """ + + pattern = r"@([a-zA-Z0-9]+)" + return s.str.replace(pattern, symbol) + + +def remove_tags(s: pd.Series) -> pd.Series: + + r"""Remove all tags from a given Pandas Series + + `remove_tags` removes any tags and replaces them with an empty space. + + A tag is a string formed by @ concatenated with a sequence composed of characters and digits. Example: @texthero123 + + Examples + -------- + >>> import texthero as hero + >>> import pandas as pd + >>> s = pd.Series("Hi @tag, we will remove you") + >>> hero.remove_tags(s) + 0 Hi , we will remove you + dtype: object + + """ + + return replace_tags(s, " ") From 6045edfa2c9e7d658d1897f802a57e3f17afb37d Mon Sep 17 00:00:00 2001 From: Jonathan Besomi <43236409+jbesomi@users.noreply.github.com> Date: Thu, 9 Jul 2020 14:23:10 +0200 Subject: [PATCH 08/15] remove_tags and replace_tags: improve docstring --- texthero/preprocessing.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index 15668060..e7da8a42 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -688,17 +688,15 @@ def remove_urls(s: pd.Series) -> pd.Series: def replace_tags(s: pd.Series, symbol: str) -> pd.Series: - - r"""Replace all tags from a given Pandas Series - - `replace_tags` replace all tags in the given Pandas Series with symbol + """Replace all tags 
from a given Pandas Series with symbol. - A tag is a string formed by @ concatenated with a sequence composed of characters and digits. Example: @texthero123 + A tag is a string formed by @ concatenated with a sequence of characters and digits. Example: @texthero123. Parameters ---------- - s : Pandas Series - symbol : The tag will be replaced by this symbol + s : Pandas Series + symbol : str + Symbol used to replace each tag Examples -------- @@ -708,7 +706,7 @@ def replace_tags(s: pd.Series, symbol: str) -> pd.Series: >>> hero.replace_tags(s, symbol='TAG') 0 Hi TAG, we will replace you dtype: object - + """ pattern = r"@([a-zA-Z0-9]+)" @@ -716,12 +714,9 @@ def replace_tags(s: pd.Series, symbol: str) -> pd.Series: def remove_tags(s: pd.Series) -> pd.Series: - - r"""Remove all tags from a given Pandas Series + """Remove all tags from a given Pandas Series. - `remove_tags` removes any tags and replaces them with an empty space. - - A tag is a string formed by @ concatenated with a sequence composed of characters and digits. Example: @texthero123 + A tag is a string formed by @ concatenated with a sequence of characters and digits. Example: @texthero123. Tags are replaced by an empty space ` `. Examples -------- @@ -731,7 +726,9 @@ def remove_tags(s: pd.Series) -> pd.Series: >>> hero.remove_tags(s) 0 Hi , we will remove you dtype: object - - """ + See also + -------- + :meth:`texthero.preprocessing.replace_tags` for replacing a tag with a custom symbol. + """ return replace_tags(s, " ") From 19925eedaa51b2c1ece70dcafee2a4eaf81a765f Mon Sep 17 00:00:00 2001 From: ishanarora04 Date: Thu, 9 Jul 2020 18:01:58 +0530 Subject: [PATCH 09/15] PR for contributor addition.
(#52) * README.md * updated README --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 0166e929..4a912a88 100644 --- a/README.md +++ b/README.md @@ -312,6 +312,7 @@ If you have just other questions or inquiry drop me a line at jonathanbesomi__AT - [Dan Keefe](https://github.com/Peritract) - [Christian Claus](https://github.com/cclauss) - [bobfang1992](https://github.com/bobfang1992) +- [Ishan Arora](https://github.com/ishanarora04)

License

From 4337b07d1fb228b60b519871a8abea80eb28bdbf Mon Sep 17 00:00:00 2001 From: Jonathan Besomi <43236409+jbesomi@users.noreply.github.com> Date: Thu, 9 Jul 2020 16:22:42 +0200 Subject: [PATCH 10/15] Update CONTRIBUTING.md --- CONTRIBUTING.md | 196 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 133 insertions(+), 63 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 17fa448d..f476dc36 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,30 +1,143 @@ -# Contributing +# CONTRIBUTING -Hello there! +Hello and welcome to Texthero! -Thank you for being here. Texthero is maintained by [jbesomi](https://github.com/jbesomi). He is glad to receive your help. +This document contains all the important information you need to get started contributing. -## Getting started -If you feel you want to help and do not know where to start, you may start with the `good first issue` [issues](https://github.com/jbesomi/texthero/issues). +## Vision -## Development workflow +In case you are interested in the Texthero's vision as well as the core-principle, have a look at [PURPOSE.md](./PURPOSE.md) -The next steps will guide you towards making contributions to this repository. You just have to follows step-by-step. If anything is not clear or you have an idea on how to improve this document, feel free to edit it and open a pull request. -In case you need a more broad vision on how contributions work on Github, please refers to the [Github Guides](https://guides.github.com/). For getting started, read also [Creating a pull request from a fork](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request-from-a-fork). +## Quality -If you are used to the Github workflow, you can find at the end of this document a summary of the most important parts. +Texthero's main goal is to make the NLP-developer life _easier_. It does so by +1. Provide a simple-yet-complete tool for NLP and text analytics +2. 
Empower the NLP developer with great documentation, simple getting started docs as well as (work in progress) clear and concise tutorials (blog). -1. Fork the repository - Click the `fork` button in the GitHub repository; this will create a copy of Texthero in your Github account. +To achieve all of this, Texthero's code and documentation must be of high quality. Having a clean, readable, and **tested** code drastically reduces the likelihood of introducing bugs, and having great documentation will facilitate the work of many NLP developers as well as the work of Texthero's maintainers. -1. Clone the repository - To do that, you need to have [git](https://git-scm.com/) installed. Open the terminal and type + +## Shift-left testing + +Texthero follows an approach known as shift-left testing. According to [Wikipedia](https://en.wikipedia.org/wiki/Shift-left_testing): + +> Shift-left testing is an approach to software testing and system testing in which testing is performed earlier in the lifecycle. + +Shift-left testing reduces the number of bugs by attempting to solve the problem at the origin. Often many programming defects are not uncovered and fixed until after significant effort has been wasted on their implementation. Texthero attempts to avoid this kind of issue. + + +## Improve documentation! + +A very important yet not particularly complex task consists in improving the documentation: many Texthero users will be deeply grateful for your effort. + +For instance, as of now, [texthero.representation.nmf](https://texthero.org/docs/api/texthero.representation.nmf) is very poor. + +> Interested in improving this? It's pretty easy. Just copy-paste the docstring from texthero.representation.pca and replace 'pca' with 'nmf' :D + + +## How to create a successful Pull Request on Texthero + +Making sure your pull requests do not break the code and bring something valuable to the project means that only _high quality_ pull requests are approved.
+ +The following list gives some advice on how to submit a successful pull request. + +1. Submitting a successful PR is not hard. Have a look at the [previous PRs](https://github.com/jbesomi/texthero/pulls?q=is%3Apr+is%3Aclosed) already approved. +1. **Extensively test your code**. Think of all possible edge cases. Look at similar tests for ideas. +1. In most cases, there exists an example of a function or docstring very similar to your specific use-case. Before writing your own code, look at what the other functions look like. +1. Before submitting, **test locally** that you pass all tests (see below under `testing`). +1. Respect the best practices (see below under `best practices`) +1. Make sure your code is black-formatted (`./format.sh`, see `formatting`) + + + + +## Ask questions! + +We are there for you! If anything is unclear, just ask. We will do our best to answer you quickly. + +## Propose new ideas! + +Texthero is there for the NLP-community. If you have an idea on how we can improve it, let us know by opening a new [issue](https://github.com/jbesomi/texthero/issues). We will be glad to hear from you! + +## Best practices + +1. Read and respect the [numpydoc docstring guide](https://numpydoc.readthedocs.io/en/latest/format.html). Look at the code for similarity. +1. Give your branch a meaningful name. Avoid using the master branch. + +## Good first issue + +If this is your first time contributing to Texthero, you might start by choosing a `good first issue` [issues](https://github.com/jbesomi/texthero/issues). + + +## Testing + +As you understood, Texthero is serious about testing. We strongly encourage contributors to embrace [test-driven development (TDD)](https://en.wikipedia.org/wiki/Test-driven_development).
+ +Tests are made with `unittest` from the python standard library: [Unit testing framework](https://docs.python.org/3/library/unittest.html) + +To execute all tests, you can simply +``` +$ cd scripts +$ ./tests.sh +``` + +Calling `./test.sh` is equivalent to execute form the _root_ `python3 -m unittest discover -s tests -t .` + + +**Important.** If you worked on a bug, you should add a test that checks the bug is not present anymore. This is extremely useful as it avoids to re-introduce the same bug again in the future. + + +### Passing doctests + +When executing `./test.sh` it will also check that the Examples in the docstrings are correct (doctests). + +Passing doctests might be a bit annoying sometimes. Let's look at this example for instance: + +``` +File "/home/travis/build/jbesomi/texthero/texthero/preprocessing.py", line 700, in texthero.preprocessing.remove_tags +Failed example: + hero.remove_tags(s) +Expected: + 0 instagram texthero + dtype: object +Got: + 0 instagram texthero + dtype: object +``` + +The docstring failed? Why? The reason is that somewhere in the `Example` section of docstring, we missed one or more white spaces ` `. + +### Travis CI + +When you submit your code, all code will be tested on different operating systems using Travis CI: [TRAVIS CI texthero](https://travis-ci.com/github/jbesomi/texthero). + +Make sure you pass all your test locally before opening a pull request! + +## Formatting + +Before submitting, make sure your code is formatted. Code formatting is done with [black](https://github.com/psf/black). ``` -$ git clone git@github.com:YOUR_USERNAME/texthero.git +cd scripts +./format.sh ``` + +Travis CI will check that the whole code is black-formatted. Make sure you format before submitting! + +> It's handy to install the black formatter directly on your IDE. + + +## Development workflow + +In case you need a more broad vision on how contributions work on Github, please refers to the [Github Guides](https://guides.github.com/). 
For getting started, you might find [Creating a pull request from a fork](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request-from-a-fork) useful. + +1. Fork the repository + +1. Clone the repository + 1. Connect your cloned repository to the _original_ repo ``` @@ -32,7 +145,7 @@ $ cd texthero $ git remote add upstream git@github.com:jbesomi/texthero.git ``` -> This first step needs to be done only once. If in the future you will want to make new changes, make sure your repository is synchronized with respect to the upstream: [Syncing a fork](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/syncing-a-fork). +> This first step needs to be done only once. But, in the future when you will want to make new changes, make sure your repository is synchronized with respect to the upstream: [Syncing a fork](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/syncing-a-fork). 1. Install texthero locally and his dev-dependencies @@ -44,7 +157,7 @@ $ pip install -e . > The `-e` will install the python package in 'development' mode. That way your changes will take effect immediately without the need to reinstall the package again. -1. Install development dependencies +1. Install development dependencies (only required if you want to change the website doc) Development dependencies need to be installed to update the website documentation, i.e the content in texthero.org. @@ -54,7 +167,6 @@ In most cases, you **do not need** to update this. Changes from pull requests wi pip install -e '.[dev]' ``` - 1. Create a new working branch You can name it as you wish. A good practice is to give the branch a meaningful name so others know what you are working on. @@ -77,17 +189,6 @@ Before opening a new pull-request, you should make sure that all tests still pas **Important.** If you worked on a bug, you should add a test that checks the bug is not present anymore. 
This is extremely useful as it avoids to re-introduce the same bug again in the future. -In this part, you need to execute: - - `./format.sh` that will format all code with `black` -- `./test.sh` that will test all unittests and doctests. - -> In the scripts folder there is also a `check.sh` shell script. Other than executing all tests, `check.sh` script will format again all the repository code and [update the documentation](#documentation) with the new changes. In most cases, you don't need to execute this one. To properly execute the check command, you need to make sure you have installed all the required dependencies, in particular Sphinx. - -``` -cd scripts -./format.sh -./test.sh -``` 1. Open a Pull Request (PR) @@ -100,32 +201,14 @@ The time to submit the PR has come. Head to your forked repository on Github. Th - `./formath.sh` - format all code with [black](https://github.com/psf/black) - `./check.sh` - - format the code with black (`format.sh`) - - update the Sphinx documentation for the website + - Format the code with black (`format.sh`) + - Update the Sphinx documentation for the website - Execute all test with `unittest` (`check.sh`) - - **This is the only and main file that must be called.** -## Good to know -1. Passing doctests might be a bit annoying sometimes. Let's look at this example for instance: - -``` -File "/home/travis/build/jbesomi/texthero/texthero/preprocessing.py", line 700, in texthero.preprocessing.remove_tags -Failed example: - hero.remove_tags(s) -Expected: - 0 instagram texthero - dtype: object -Got: - 0 instagram texthero - dtype: object -``` -The docstring failed but it's not particularly clear why, right? Here, the reason is that somewhere on the docstring `Example`, we missed one or more white spaces ` `. -## Conventions - -### Documentation and website +## Documentation: docstring Texthero docstring follows [NumPy/SciPy](https://numpydoc.readthedocs.io/en/latest/format.html) docstring style. 
For example: @@ -154,7 +237,6 @@ def remove_digits(input: pd.Series, only_blocks=True) -> pd.Series: ... ``` - ### Git commits - Strive for atomicity: 1 commit = 1 context. @@ -162,16 +244,4 @@ def remove_digits(input: pd.Series, only_blocks=True) -> pd.Series: - You can reference relevant issues using a hashtag plus the number of the issue. Example: `#1` -## Test-driven development - -Texthero is serious about testing. We strongly encourage contributors to embrace [test-driven development (TDD)](https://en.wikipedia.org/wiki/Test-driven_development). - -Tests are made with `unittest` from the python standard library: [Unit testing framework](https://docs.python.org/3/library/unittest.html) - -To execute all tests, you can simply -``` -$ cd scripts -$ ./tests.sh -``` - -Calling `./test.sh` is equivalent to execute form the _root_ `python3 -m unittest discover -s tests -t .` +**Work in progress:** this document is a work in progress. If you spot a mistake or you want to make something clear, open a pull request! From 301822dcc72c7d261fbe368ffc6fdf74a6dba33c Mon Sep 17 00:00:00 2001 From: Shreyas Minocha <11537232+shreyasminocha@users.noreply.github.com> Date: Thu, 9 Jul 2020 14:27:06 +0000 Subject: [PATCH 11/15] Fix language name (#53) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4a912a88..946879d7 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ Texthero is free, open-source and [well documented](https://texthero.org/docs) ( We hope you will find pleasure working with Texthero as we had during his development. -

Hablas español? क्या आप भारतीय बोलते हैं? 日本語が話せるのか?

+

Hablas español? क्या आप हिंदी बोलते हैं? 日本語が話せるのか?

Texthero has been developed for the whole NLP community. We know how hard is to deal with different NLP tools (NLTK, SpaCy, Gensim, TextBlob, Sklearn): that's why we developed Texthero, to simplify things. From fcb286e3e8625cb99c245d12fde7e761b218b0ef Mon Sep 17 00:00:00 2001 From: ishanarora04 Date: Sat, 11 Jul 2020 14:04:11 +0530 Subject: [PATCH 12/15] Preprocessing removed the capturing group from regex => it was unnecessary (#64) --- texthero/preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index e7da8a42..0c947ad0 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -709,7 +709,7 @@ def replace_tags(s: pd.Series, symbol: str) -> pd.Series: """ - pattern = r"@([a-zA-Z0-9]+)" + pattern = r"@[a-zA-Z0-9]+" return s.str.replace(pattern, symbol) From 81411c2ab95210fe328511eb8ffe3eab89124b3a Mon Sep 17 00:00:00 2001 From: ishanarora04 Date: Sat, 11 Jul 2020 14:07:34 +0530 Subject: [PATCH 13/15] added replace hashtags and remove hashtag (#58) * added replace hashtags and remove hashtag * Fixed the Documentation * Preprocessing Hashtag Regex as a raw string --- tests/test_preprocessing.py | 16 +++++++++++++ texthero/preprocessing.py | 46 +++++++++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py index 9372d9f6..b15c48d5 100644 --- a/tests/test_preprocessing.py +++ b/tests/test_preprocessing.py @@ -281,3 +281,19 @@ def test_remove_tags_numeric(self): s_true = pd.Series("Hi , we will remove you") self.assertEqual(preprocessing.remove_tags(s), s_true) + + """ + Test replace and remove hashtags + """ + + def test_replace_hashtags(self): + s = pd.Series("Hi #hashtag, we will replace you") + s_true = pd.Series("Hi HASHTAG, we will replace you") + + self.assertEqual(preprocessing.replace_hashtags(s, symbol="HASHTAG"), s_true) + + def test_remove_hashtags(self): + s = pd.Series("Hi 
#hashtag_trending123, we will remove you") + s_true = pd.Series("Hi , we will remove you") + + self.assertEqual(preprocessing.remove_hashtags(s), s_true) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index 0c947ad0..065aa2a0 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -732,3 +732,49 @@ def remove_tags(s: pd.Series) -> pd.Series: :meth:`texthero.preprocessing.replace_tags` for replacing a tag with a custom symbol. """ return replace_tags(s, " ") + + +def replace_hashtags(s: pd.Series, symbol: str) -> pd.Series: + """Replace all hashtags from a Pandas Series with symbol + + A hashtag is a string formed by # concatenated with a sequence of characters, digits and underscores. Example: #texthero_123. + + Parameters + ---------- + s : Pandas Series + symbol : str + Symbol that replaces each hashtag + + Examples + -------- + >>> import texthero as hero + >>> import pandas as pd + >>> s = pd.Series("Hi #texthero_123, we will replace you.") + >>> hero.replace_hashtags(s, symbol='HASHTAG') + 0 Hi HASHTAG, we will replace you. + dtype: object + + """ + pattern = r"#[a-zA-Z0-9_]+" + return s.str.replace(pattern, symbol) + + +def remove_hashtags(s: pd.Series) -> pd.Series: + """Remove all hashtags from a given Pandas Series + + A hashtag is a string formed by # concatenated with a sequence of characters, digits and underscores. Example: #texthero_123. + + Examples + -------- + >>> import texthero as hero + >>> import pandas as pd + >>> s = pd.Series("Hi #texthero_123, we will remove you.") + >>> hero.remove_hashtags(s) + 0 Hi , we will remove you. + dtype: object + + See also + -------- + :meth:`texthero.preprocessing.replace_hashtags` for replacing a hashtag with a custom symbol. 
+ """ + return replace_hashtags(s, " ") From d50559d850add56735fc6a0ee35df2632720eadd Mon Sep 17 00:00:00 2001 From: henrifroese <50276689+henrifroese@users.noreply.github.com> Date: Sat, 11 Jul 2020 10:44:40 +0200 Subject: [PATCH 14/15] Add count_sentences function to nlp.py (#51) * Add count_sentences function to nlp.py Also add tests for the function to test_nlp.py * Implement suggestions from pull request. Add more tests, change style (docstring, tests naming). Remove unicode-casting to avoid unexpected behaviour. * Add link to spacy documentation. Additionally update index tests, they're cleaner now. Co-authored-by: Henri Froese --- tests/test_nlp.py | 32 ++++++++++++++++++++++++++++++++ texthero/nlp.py | 34 ++++++++++++++++++++++++++++++++-- 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/tests/test_nlp.py b/tests/test_nlp.py index bd062a9e..2df9db61 100644 --- a/tests/test_nlp.py +++ b/tests/test_nlp.py @@ -1,4 +1,5 @@ import pandas as pd +import numpy as np from texthero import nlp from . import PandasTestCase @@ -36,3 +37,34 @@ def test_noun_chunks(self): [[("Today", "NP", 0, 5), ("such a beautiful day", "NP", 9, 29)]] ) self.assertEqual(nlp.noun_chunks(s), s_true) + + """ + Count sentences. + """ + + def test_count_sentences(self): + s = pd.Series("I think ... it counts correctly. Doesn't it? 
Great!") + s_true = pd.Series(3) + self.assertEqual(nlp.count_sentences(s), s_true) + + def test_count_sentences_numeric(self): + s = pd.Series([13.0, 42.0]) + self.assertRaises(TypeError, nlp.count_sentences, s) + + def test_count_sentences_missing_value(self): + s = pd.Series(["Test.", np.nan]) + self.assertRaises(TypeError, nlp.count_sentences, s) + + def test_count_sentences_index(self): + s = pd.Series(["Test"], index=[5]) + counted_sentences_s = nlp.count_sentences(s) + t_same_index = pd.Series([""], index=[5]) + + self.assertTrue(counted_sentences_s.index.equals(t_same_index.index)) + + def test_count_sentences_wrong_index(self): + s = pd.Series(["Test", "Test"], index=[5, 6]) + counted_sentences_s = nlp.count_sentences(s) + t_different_index = pd.Series(["", ""], index=[5, 7]) + + self.assertFalse(counted_sentences_s.index.equals(t_different_index.index)) diff --git a/texthero/nlp.py b/texthero/nlp.py index d2da7af5..df32128e 100644 --- a/texthero/nlp.py +++ b/texthero/nlp.py @@ -11,11 +11,11 @@ def named_entities(s, package="spacy"): Return named-entities. Return a Pandas Series where each rows contains a list of tuples containing information regarding the given named entities. - + Tuple: (`entity'name`, `entity'label`, `starting character`, `ending character`) Under the hood, `named_entities` make use of Spacy name entity recognition. - + List of labels: - `PERSON`: People, including fictional. - `NORP`: Nationalities or religious or political groups. @@ -76,3 +76,33 @@ def noun_chunks(s): ) return pd.Series(noun_chunks, index=s.index) + + +def count_sentences(s: pd.Series) -> pd.Series: + """ + Count the number of sentences per cell in a Pandas Series. + + Return a new Pandas Series with the number of sentences per cell. + + This makes use of the SpaCy `sentencizer `. + + Examples + -------- + >>> import texthero as hero + >>> import pandas as pd + >>> s = pd.Series(["Yesterday I was in NY with Bill de Blasio. Great story...", "This is the F.B.I.! What? 
Open up!"]) + >>> hero.count_sentences(s) + 0 2 + 1 3 + dtype: int64 + """ + number_of_sentences = [] + + nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"]) + nlp.add_pipe(nlp.create_pipe("sentencizer")) # Pipe is only "sentencizer" + + for doc in nlp.pipe(s.values, batch_size=32): + sentences = len(list(doc.sents)) + number_of_sentences.append(sentences) + + return pd.Series(number_of_sentences, index=s.index) From 7e1ad2f0b530892042ebffe9c28429fe20174aae Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Sun, 12 Jul 2020 21:16:12 +0200 Subject: [PATCH 15/15] Improve automated_readability_index. Now incorporates suggested changes. Input checking done with pd.api.types.is_string_dtype. Not a permanent solution, will be improved by #60 etc. Co-authored-by: Maximilian Krahn --- tests/test_visualization.py | 17 ++++++++--- texthero/visualization.py | 58 +++++++++++++------------------------ 2 files changed, 33 insertions(+), 42 deletions(-) diff --git a/tests/test_visualization.py b/tests/test_visualization.py index 71866469..82180778 100644 --- a/tests/test_visualization.py +++ b/tests/test_visualization.py @@ -65,11 +65,20 @@ def test_wordcloud(self): Test automated readability index """ - def test_ari(self): + def test_automated_readability_index(self): s = pd.Series(["New York is a beautiful city.", "Look: New York!", "Wow"]) - s_true = pd.Series([3.0, 6.0, np.nan]) + s_true = pd.Series([3.0, 6.0, 0.0]) self.assertEqual(visualization.automated_readability_index(s), s_true) - def test_ari_numeric(self): + def test_automated_readability_index_index(self): + s = pd.Series( + ["New York is a beautiful city.", "Look: New York!", "Wow"], + index=[5, 6, 7], + ) + self.assertTrue( + visualization.automated_readability_index(s).index.equals(s.index) + ) + + def test_automated_readability_index_numeric(self): s = pd.Series([1.0, 2.0]) - self.assertRaises(ValueError, visualization.automated_readability_index, s) + self.assertRaises(TypeError, 
visualization.automated_readability_index, s) diff --git a/texthero/visualization.py b/texthero/visualization.py index 47fb29eb..7065a873 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -8,7 +8,7 @@ from wordcloud import WordCloud -from texthero import preprocessing +from texthero import preprocessing, nlp import string from matplotlib.colors import LinearSegmentedColormap as lsg @@ -159,7 +159,7 @@ def top_words(s: pd.Series, normalize=False) -> pd.Series: Return a pandas series with index the top words and as value the count. Tokenization: split by space and remove all punctuations that are not between characters. - + Parameters ---------- normalize : @@ -203,7 +203,7 @@ def automated_readability_index(s: pd.Series) -> pd.Series: >>> hero.automated_readability_index(s) 0 3.0 1 6.0 - 2 NaN + 2 0.0 dtype: float64 Reference @@ -211,38 +211,20 @@ def automated_readability_index(s: pd.Series) -> pd.Series: `Automated Readability Index `_ """ - try: - s = preprocessing.remove_whitespace( - s - ) # Whitespace is used to calculate number of words. - except: - raise ValueError("Input series has non-string items.") - - def _ari(text: str): - # Computes the ARI for one string. - # Straightforward implementation of the Wikipedia description. - if not isinstance(text, str): - return np.nan - - characters = sentences = words = 0 - - for char in text: - if char.isalnum(): - characters += 1 - elif char == " ": - words += 1 - elif char in [".", "!", "?"]: - sentences += 1 - else: - continue - - # Avoid 0-division. - if words > 0 and sentences > 0: - score = 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43 - score = np.ceil(score) - else: - score = np.nan - - return score - - return s.apply(_ari) + if not pd.api.types.is_string_dtype(s): + raise TypeError("Non-string values in given Series.") + + words_s = s.str.split().str.len() - 1 + characters_s = s.str.count(r"[a-zA-Z0-9]") # Regex for alphanumeric. 
+ sentences_s = nlp.count_sentences(s) + + score_s = ( + 4.71 * (characters_s / words_s) + 0.5 * (words_s / sentences_s) - 21.43 + ) + score_s = np.ceil(score_s) + + # Pandas does not raise an Error when dividing by zero -> remove + # wrong values by ourselves. + score_s.loc[~np.isfinite(score_s)] = 0 + + return score_s