From 07a05848b0883da41be3b27d7e6fb354d34fd7dd Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Wed, 5 Aug 2020 20:14:59 +0200 Subject: [PATCH 1/2] Implement support for Representation Series in "Dimensionality Reduction" functions of representation module - Implement full support for Representation Series in "Dimensionality Reduction" functions of representation module - add appropriate parameterized tests in `test_representation.py` Note: the dimensionality reduction functions *do support* both flat and representational input; this is because, as discussed in #134, we will not implement a `to_repr` function, so users might have e.g. a flattened tfidf series they want to perform pca on -> they can use the function. Of course, we could remove the flat support (if we basically want to "forbid" users from using `VectorSeries` for tfidf/term_freq/count); we're not 100% sure. Co-authored-by: Maximilian Krahn --- tests/test_representation.py | 59 ++++++++++++++++++++++- texthero/representation.py | 92 ++++++++++++++++++++++++++++++++++-- texthero/visualization.py | 2 +- 3 files changed, 147 insertions(+), 6 deletions(-) diff --git a/tests/test_representation.py b/tests/test_representation.py index 036775af..bdeac7ae 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -60,6 +60,20 @@ def _tfidf(term, corpus, document_index): s_tokenized_output_min_df_index = pd.MultiIndex.from_tuples([(0, "Test"), (1, "Test")],) +s_representation_vectors_index = pd.MultiIndex.from_tuples( + [(5, "A"), (5, "B"), (7, "A"), (7, "C")] +) + +s_representation_vectors = pd.Series( + [1.0, 0.0, 0.0, 1.0], index=s_representation_vectors_index +) + +s_flat_vectors_index = pd.Index([5, 7]) + +s_flat_vectors = pd.Series( + [[1.0, 0.0, 0.0], [0.0, 0.0, 1.0]], index=s_flat_vectors_index +) + test_cases_vectorization = [ # format: [function_name, function, correct output for tokenized input above, dtype of output] @@ -85,6 +99,16 @@ def _tfidf(term, corpus, document_index): 
["tfidf", representation.tfidf, [2.0, 1.0], "float",], ] +test_cases_dim_reduction = [ + # format: [function_name, function, correct output for numeric input above, dtype of output] + [ + "pca", + representation.pca, + [[0.7071067811865475, 0.0], [-0.7071067811865475, 0.0]], + ], + ["nmf", representation.nmf, [[0.0, 1.0], [1.0, 0.0]]], +] + class AbstractRepresentationTest(PandasTestCase): """ @@ -163,7 +187,40 @@ def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *arg @parameterized.expand(test_cases_vectorization) def test_vectorization_arguments_to_sklearn(self, name, test_function, *args): try: - test_function(s_not_tokenized, max_features=1, min_df=1, max_df=1.0) + test_function(s_tokenized, max_features=1, min_df=1, max_df=1.0) + except TypeError: + self.fail("Sklearn arguments not handled correctly.") + + """ + Dimensionality Reduction + """ + + @parameterized.expand(test_cases_dim_reduction) + def test_dim_reduction_simple_with_index( + self, name, test_function, correct_output_values + ): + s_true = pd.Series(correct_output_values, index=s_flat_vectors_index) + + result_s = test_function(s_representation_vectors, random_state=42) + + # check_less_precise True to prevent rounding errors from giving a Failure. + pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True) + + @parameterized.expand(test_cases_dim_reduction) + def test_dim_reduction_flat_with_index( + self, name, test_function, correct_output_values + ): + s_true = pd.Series(correct_output_values, index=s_flat_vectors_index) + + result_s = test_function(s_flat_vectors, random_state=42) + + # check_less_precise True to prevent rounding errors from giving a Failure. 
+ pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True) + + @parameterized.expand(test_cases_dim_reduction) + def test_dim_reduction_arguments_to_sklearn(self, name, test_function, *args): + try: + test_function(s_representation_vectors, n_components=2, random_state=42) except TypeError: self.fail("Sklearn arguments not handled correctly.") diff --git a/texthero/representation.py b/texthero/representation.py index 07b7706c..a35445c8 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -432,6 +432,9 @@ def pca(s, n_components=2, random_state=None) -> pd.Series: In general, *pca* should be called after the text has already been represented to a matrix form. + The input can either be a Document Representation Series or a flat Series. + TODO add tutorial link + Parameters ---------- s : Pandas Series @@ -466,9 +469,38 @@ def pca(s, n_components=2, random_state=None) -> pd.Series: -------- `PCA on Wikipedia `_ + Document Representation Series: TODO add tutorial link + """ pca = PCA(n_components=n_components, random_state=random_state, copy=False) - return pd.Series(pca.fit_transform(list(s)).tolist(), index=s.index) + + if _check_is_valid_representation(s): + + if pd.api.types.is_sparse(s): + s_coo_matrix = s.sparse.to_coo()[0] + if s_coo_matrix.shape[1] > 1000: + warnings.warn( + "Be careful. You are trying to compute PCA from a Sparse Pandas Series with a very large vocabulary." + " Principal Component Analysis normalize the data and this act requires to expand the input Sparse Matrix." + " This operation might take long. Consider using `svd_truncated` instead as it can deals with Sparse Matrix efficiently." + ) + else: + # Treat it as a Sparse matrix anyway for efficiency. + s = s.astype("Sparse") + s_coo_matrix = s.sparse.to_coo()[0] + + s_for_vectorization = s_coo_matrix.todense() # PCA cannot handle sparse input. 
+ + # Else: no Document Representation Series -> like before + else: + s_for_vectorization = list(s) + + s_out = pd.Series( + pca.fit_transform(s_for_vectorization).tolist(), index=s.index.unique(level=0), + ) + s_out = s_out.rename_axis(None) + + return s_out def nmf(s, n_components=2, random_state=None) -> pd.Series: @@ -488,6 +520,8 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series: and calculate a vector for each document that places it correctly among the topics. + The input can either be a Document Representation Series or a flat Series. + TODO add tutorial link Parameters ---------- @@ -525,9 +559,33 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series: -------- `NMF on Wikipedia `_ + The input can either be a Document Representation Series or a flat Series. + TODO add tutorial link """ - nmf = NMF(n_components=n_components, init="random", random_state=random_state,) - return pd.Series(nmf.fit_transform(list(s)).tolist(), index=s.index) + nmf = NMF(n_components=n_components, init=None, random_state=random_state) + + if _check_is_valid_representation(s): + + if pd.api.types.is_sparse(s): + s_coo_matrix = s.sparse.to_coo()[0] + else: + # Treat it as a Sparse matrix anyway for efficiency. + s = s.astype("Sparse") + s_coo_matrix = s.sparse.to_coo()[0] + + s_for_vectorization = s_coo_matrix # NMF can work with sparse input. + + # Else: no Document Representation Series -> like before + else: + s_for_vectorization = list(s) + + s_out = pd.Series( + nmf.fit_transform(s_for_vectorization).tolist(), index=s.index.unique(level=0), + ) + + s_out = s_out.rename_axis(None) + + return s_out def tsne( @@ -554,6 +612,8 @@ def tsne( vector in such a way that the differences / similarities between documents are preserved. + The input can either be a Document Representation Series or a flat Series. 
+ TODO add tutorial link Parameters ---------- @@ -610,6 +670,8 @@ def tsne( -------- `t-SNE on Wikipedia `_ + Document Representation Series: TODO add tutorial link + """ tsne = TSNE( n_components=n_components, @@ -619,7 +681,29 @@ def tsne( random_state=random_state, n_jobs=n_jobs, ) - return pd.Series(tsne.fit_transform(list(s)).tolist(), index=s.index) + + if _check_is_valid_representation(s): + + if pd.api.types.is_sparse(s): + s_coo_matrix = s.sparse.to_coo()[0] + else: + # Treat it as a Sparse matrix anyway for efficiency. + s = s.astype("Sparse") + s_coo_matrix = s.sparse.to_coo()[0] + + s_for_vectorization = s_coo_matrix # TSNE can work with sparse input. + + # Else: no Document Representation Series -> like before + else: + s_for_vectorization = list(s) + + s_out = pd.Series( + tsne.fit_transform(s_for_vectorization).tolist(), index=s.index.unique(level=0) + ) + + s_out = s_out.rename_axis(None) + + return s_out """ diff --git a/texthero/visualization.py b/texthero/visualization.py index b6f70397..829509fc 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -62,7 +62,7 @@ def scatterplot( >>> import pandas as pd >>> df = pd.DataFrame(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, fun, guitar"], columns=["texts"]) >>> df["texts"] = hero.clean(df["texts"]).pipe(hero.tokenize) - >>> df["pca"] = hero.tfidf(df["texts"]).pipe(hero.flatten).pipe(hero.pca, n_components=3) # TODO: when others get Representation Support: remove flatten + >>> df["pca"] = hero.tfidf(df["texts"]).pipe(hero.pca, n_components=3) >>> df["topics"] = hero.tfidf(df["texts"]).pipe(hero.flatten).pipe(hero.kmeans, n_clusters=2) # TODO: when others get Representation Support: remove flatten >>> hero.scatterplot(df, col="pca", color="topics", hover_data=["texts"]) # doctest: +SKIP """ From 59d566fdd1c398cf6175b0e91e3da5aaa1bbc092 Mon Sep 17 00:00:00 2001 From: Henri Froese Date: Fri, 7 Aug 2020 19:16:51 +0200 Subject: [PATCH 2/2] 
Remove support for vector series input in dim. red. functions - dim. red. functions only support Representation Series input - appropriately change tests --- tests/test_indexes.py | 16 ++++++++++---- tests/test_representation.py | 11 ---------- texthero/representation.py | 42 +++++++++++++++++++++++------------- 3 files changed, 39 insertions(+), 30 deletions(-) diff --git a/tests/test_indexes.py b/tests/test_indexes.py index cc041c3a..af570840 100644 --- a/tests/test_indexes.py +++ b/tests/test_indexes.py @@ -12,6 +12,11 @@ s_tokenized_lists = pd.Series([["Test", "Test2"], ["Test3"]], index=[5, 6]) s_numeric = pd.Series([5.0], index=[5]) s_numeric_lists = pd.Series([[5.0, 5.0], [6.0, 6.0]], index=[5, 6]) +s_representation_vectors = pd.Series( + [1.0, 0.0, 0.0, 1.0], + index=pd.MultiIndex.from_tuples([(5, "A"), (5, "B"), (7, "A"), (7, "C")]), +) + # Define all test cases. Every test case is a list # of [name of test case, function to test, tuple of valid input for the function]. @@ -71,9 +76,9 @@ lambda x: representation.flatten(representation.tfidf(x)), (s_tokenized_lists,), ], - ["pca", representation.pca, (s_numeric_lists, 0)], - ["nmf", representation.nmf, (s_numeric_lists,)], - ["tsne", representation.tsne, (s_numeric_lists,)], + ["pca", representation.pca, (s_representation_vectors,),], + ["nmf", representation.nmf, (s_representation_vectors,),], + ["tsne", representation.tsne, (s_representation_vectors,),], ["kmeans", representation.kmeans, (s_numeric_lists, 1)], ["dbscan", representation.dbscan, (s_numeric_lists,)], ["meanshift", representation.meanshift, (s_numeric_lists,)], @@ -107,7 +112,10 @@ def test_correct_index(self, name, test_function, valid_input): s = valid_input[0] result_s = test_function(*valid_input) t_same_index = pd.Series(s.values, s.index) - self.assertTrue(result_s.index.equals(t_same_index.index)) + if isinstance(s.index, pd.MultiIndex): # if Representation Series + self.assertTrue(result_s.index.equals(t_same_index.index.levels[0])) + 
else: + self.assertTrue(result_s.index.equals(t_same_index.index)) @parameterized.expand(test_cases) def test_incorrect_index(self, name, test_function, valid_input): diff --git a/tests/test_representation.py b/tests/test_representation.py index bdeac7ae..06cdf7a4 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -206,17 +206,6 @@ def test_dim_reduction_simple_with_index( # check_less_precise True to prevent rounding errors from giving a Failure. pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True) - @parameterized.expand(test_cases_dim_reduction) - def test_dim_reduction_flat_with_index( - self, name, test_function, correct_output_values - ): - s_true = pd.Series(correct_output_values, index=s_flat_vectors_index) - - result_s = test_function(s_flat_vectors, random_state=42) - - # check_less_precise True to prevent rounding errors from giving a Failure. - pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True) - @parameterized.expand(test_cases_dim_reduction) def test_dim_reduction_arguments_to_sklearn(self, name, test_function, *args): try: diff --git a/texthero/representation.py b/texthero/representation.py index a35445c8..1e9dd93a 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -430,10 +430,16 @@ def pca(s, n_components=2, random_state=None) -> pd.Series: are easily visible. The corpus can now be visualized in 3D and we can get a good first view of the data! + Be careful: PCA can *not* handle sparse input, so even when calling PCA with + a very sparse Representation Series, internally texthero will compute + the whole dense representation, so if you're working with big datasets, + you should probably use :meth:`texthero.representation.nmf` or + :meth:`texthero.representation.tsne` as they can handle sparse input. + In general, *pca* should be called after the text has already been represented to a matrix form. 
- The input can either be a Document Representation Series or a flat Series. - TODO add tutorial link + The input has to be a Representation Series. + TODO add typing module link Parameters ---------- @@ -469,7 +475,7 @@ def pca(s, n_components=2, random_state=None) -> pd.Series: -------- `PCA on Wikipedia `_ - Document Representation Series: TODO add tutorial link + Representation Series: TODO add tutorial link and typing module link """ pca = PCA(n_components=n_components, random_state=random_state, copy=False) @@ -491,9 +497,11 @@ def pca(s, n_components=2, random_state=None) -> pd.Series: s_for_vectorization = s_coo_matrix.todense() # PCA cannot handle sparse input. - # Else: no Document Representation Series -> like before + # Else: no Representation Series -> fail else: - s_for_vectorization = list(s) + raise ValueError( + f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex, where the first level represent the document and the second one the words/token. The given Pandas Series has {s.index.nlevels} number of levels instead of 2." + ) s_out = pd.Series( pca.fit_transform(s_for_vectorization).tolist(), index=s.index.unique(level=0), @@ -520,7 +528,7 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series: and calculate a vector for each document that places it correctly among the topics. - The input can either be a Document Representation Series or a flat Series. + The input has to be a Representation Series. TODO add tutorial link Parameters @@ -559,9 +567,9 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series: -------- `NMF on Wikipedia `_ - The input can either be a Document Representation Series or a flat Series. 
- TODO add tutorial link + Representation Series: TODO add tutorial link and typing module link """ + nmf = NMF(n_components=n_components, init=None, random_state=random_state) if _check_is_valid_representation(s): @@ -575,9 +583,11 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series: s_for_vectorization = s_coo_matrix # NMF can work with sparse input. - # Else: no Document Representation Series -> like before + # Else: no Representation Series -> fail else: - s_for_vectorization = list(s) + raise ValueError( + f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex, where the first level represent the document and the second one the words/token. The given Pandas Series has {s.index.nlevels} number of levels instead of 2." + ) s_out = pd.Series( nmf.fit_transform(s_for_vectorization).tolist(), index=s.index.unique(level=0), @@ -612,8 +622,8 @@ def tsne( vector in such a way that the differences / similarities between documents are preserved. - The input can either be a Document Representation Series or a flat Series. - TODO add tutorial link + The input has to be a Representation Series. + TODO add typing module link Parameters ---------- @@ -670,7 +680,7 @@ def tsne( -------- `t-SNE on Wikipedia `_ - Document Representation Series: TODO add tutorial link + Representation Series: TODO add tutorial link and typing module link """ tsne = TSNE( @@ -693,9 +703,11 @@ def tsne( s_for_vectorization = s_coo_matrix # TSNE can work with sparse input. - # Else: no Document Representation Series -> like before + # Else: no Representation Series -> fail else: - s_for_vectorization = list(s) + raise ValueError( + f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex, where the first level represent the document and the second one the words/token. The given Pandas Series has {s.index.nlevels} number of levels instead of 2." 
+ ) s_out = pd.Series( tsne.fit_transform(s_for_vectorization).tolist(), index=s.index.unique(level=0)