diff --git a/tests/test_indexes.py b/tests/test_indexes.py index cc041c3a..af570840 100644 --- a/tests/test_indexes.py +++ b/tests/test_indexes.py @@ -12,6 +12,11 @@ s_tokenized_lists = pd.Series([["Test", "Test2"], ["Test3"]], index=[5, 6]) s_numeric = pd.Series([5.0], index=[5]) s_numeric_lists = pd.Series([[5.0, 5.0], [6.0, 6.0]], index=[5, 6]) +s_representation_vectors = pd.Series( + [1.0, 0.0, 0.0, 1.0], + index=pd.MultiIndex.from_tuples([(5, "A"), (5, "B"), (7, "A"), (7, "C")]), +) + # Define all test cases. Every test case is a list # of [name of test case, function to test, tuple of valid input for the function]. @@ -71,9 +76,9 @@ lambda x: representation.flatten(representation.tfidf(x)), (s_tokenized_lists,), ], - ["pca", representation.pca, (s_numeric_lists, 0)], - ["nmf", representation.nmf, (s_numeric_lists,)], - ["tsne", representation.tsne, (s_numeric_lists,)], + ["pca", representation.pca, (s_representation_vectors,),], + ["nmf", representation.nmf, (s_representation_vectors,),], + ["tsne", representation.tsne, (s_representation_vectors,),], ["kmeans", representation.kmeans, (s_numeric_lists, 1)], ["dbscan", representation.dbscan, (s_numeric_lists,)], ["meanshift", representation.meanshift, (s_numeric_lists,)], @@ -107,7 +112,10 @@ def test_correct_index(self, name, test_function, valid_input): s = valid_input[0] result_s = test_function(*valid_input) t_same_index = pd.Series(s.values, s.index) - self.assertTrue(result_s.index.equals(t_same_index.index)) + if isinstance(s.index, pd.MultiIndex): # if Representation Series + self.assertTrue(result_s.index.equals(t_same_index.index.levels[0])) + else: + self.assertTrue(result_s.index.equals(t_same_index.index)) @parameterized.expand(test_cases) def test_incorrect_index(self, name, test_function, valid_input): diff --git a/tests/test_representation.py b/tests/test_representation.py index 036775af..06cdf7a4 100644 --- a/tests/test_representation.py +++ b/tests/test_representation.py @@ -60,6 +60,20 @@ def 
_tfidf(term, corpus, document_index): s_tokenized_output_min_df_index = pd.MultiIndex.from_tuples([(0, "Test"), (1, "Test")],) +s_representation_vectors_index = pd.MultiIndex.from_tuples( + [(5, "A"), (5, "B"), (7, "A"), (7, "C")] +) + +s_representation_vectors = pd.Series( + [1.0, 0.0, 0.0, 1.0], index=s_representation_vectors_index +) + +s_flat_vectors_index = pd.Index([5, 7]) + +s_flat_vectors = pd.Series( + [[1.0, 0.0, 0.0], [0.0, 0.0, 1.0]], index=s_flat_vectors_index +) + test_cases_vectorization = [ # format: [function_name, function, correct output for tokenized input above, dtype of output] @@ -85,6 +99,16 @@ def _tfidf(term, corpus, document_index): ["tfidf", representation.tfidf, [2.0, 1.0], "float",], ] +test_cases_dim_reduction = [ + # format: [function_name, function, correct output for numeric input above, dtype of output] + [ + "pca", + representation.pca, + [[0.7071067811865475, 0.0], [-0.7071067811865475, 0.0]], + ], + ["nmf", representation.nmf, [[0.0, 1.0], [1.0, 0.0]]], +] + class AbstractRepresentationTest(PandasTestCase): """ @@ -163,7 +187,29 @@ def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *arg @parameterized.expand(test_cases_vectorization) def test_vectorization_arguments_to_sklearn(self, name, test_function, *args): try: - test_function(s_not_tokenized, max_features=1, min_df=1, max_df=1.0) + test_function(s_tokenized, max_features=1, min_df=1, max_df=1.0) + except TypeError: + self.fail("Sklearn arguments not handled correctly.") + + """ + Dimensionality Reduction + """ + + @parameterized.expand(test_cases_dim_reduction) + def test_dim_reduction_simple_with_index( + self, name, test_function, correct_output_values + ): + s_true = pd.Series(correct_output_values, index=s_flat_vectors_index) + + result_s = test_function(s_representation_vectors, random_state=42) + + # check_less_precise True to prevent rounding errors from giving a Failure. 
+ pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True) + + @parameterized.expand(test_cases_dim_reduction) + def test_dim_reduction_arguments_to_sklearn(self, name, test_function, *args): + try: + test_function(s_representation_vectors, n_components=2, random_state=42) except TypeError: self.fail("Sklearn arguments not handled correctly.") diff --git a/texthero/representation.py b/texthero/representation.py index 07b7706c..1e9dd93a 100644 --- a/texthero/representation.py +++ b/texthero/representation.py @@ -430,8 +430,17 @@ def pca(s, n_components=2, random_state=None) -> pd.Series: are easily visible. The corpus can now be visualized in 3D and we can get a good first view of the data! + Be careful: PCA can *not* handle sparse input, so even when calling PCA with + a very sparse Representation Series, internally texthero will compute + the whole dense representation, so if you're working with big datasets, + you should probably use :meth:`texthero.representation.nmf` or + :meth:`texthero.representation.tsne` as they can handle sparse input. + In general, *pca* should be called after the text has already been represented to a matrix form. + The input has to be a Representation Series. + TODO add typing module link + Parameters ---------- s : Pandas Series @@ -466,9 +475,40 @@ def pca(s, n_components=2, random_state=None) -> pd.Series: -------- `PCA on Wikipedia <https://en.wikipedia.org/wiki/Principal_component_analysis>`_ + Representation Series: TODO add tutorial link and typing module link + """ pca = PCA(n_components=n_components, random_state=random_state, copy=False) - return pd.Series(pca.fit_transform(list(s)).tolist(), index=s.index) + + if _check_is_valid_representation(s): + + if pd.api.types.is_sparse(s): + s_coo_matrix = s.sparse.to_coo()[0] + if s_coo_matrix.shape[1] > 1000: + warnings.warn( + "Be careful. You are trying to compute PCA from a Sparse Pandas Series with a very large vocabulary." 
+ " Principal Component Analysis normalize the data and this act requires to expand the input Sparse Matrix." + " This operation might take long. Consider using `svd_truncated` instead as it can deals with Sparse Matrix efficiently." + ) + else: + # Treat it as a Sparse matrix anyway for efficiency. + s = s.astype("Sparse") + s_coo_matrix = s.sparse.to_coo()[0] + + s_for_vectorization = s_coo_matrix.todense() # PCA cannot handle sparse input. + + # Else: no Representation Series -> fail + else: + raise ValueError( + f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex, where the first level represent the document and the second one the words/token. The given Pandas Series has {s.index.nlevels} number of levels instead of 2." + ) + + s_out = pd.Series( + pca.fit_transform(s_for_vectorization).tolist(), index=s.index.unique(level=0), + ) + s_out = s_out.rename_axis(None) + + return s_out def nmf(s, n_components=2, random_state=None) -> pd.Series: @@ -488,6 +528,8 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series: and calculate a vector for each document that places it correctly among the topics. + The input has to be a Representation Series. + TODO add tutorial link Parameters ---------- @@ -525,9 +567,35 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series: -------- `NMF on Wikipedia <https://en.wikipedia.org/wiki/Non-negative_matrix_factorization>`_ + Representation Series: TODO add tutorial link and typing module link """ - nmf = NMF(n_components=n_components, init="random", random_state=random_state,) - return pd.Series(nmf.fit_transform(list(s)).tolist(), index=s.index) + + nmf = NMF(n_components=n_components, init=None, random_state=random_state) + + if _check_is_valid_representation(s): + + if pd.api.types.is_sparse(s): + s_coo_matrix = s.sparse.to_coo()[0] + else: + # Treat it as a Sparse matrix anyway for efficiency. + s = s.astype("Sparse") + s_coo_matrix = s.sparse.to_coo()[0] + + s_for_vectorization = s_coo_matrix # NMF can work with sparse input. 
+ + # Else: no Representation Series -> fail + else: + raise ValueError( + f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex, where the first level represent the document and the second one the words/token. The given Pandas Series has {s.index.nlevels} number of levels instead of 2." + ) + + s_out = pd.Series( + nmf.fit_transform(s_for_vectorization).tolist(), index=s.index.unique(level=0), + ) + + s_out = s_out.rename_axis(None) + + return s_out def tsne( @@ -554,6 +622,8 @@ def tsne( vector in such a way that the differences / similarities between documents are preserved. + The input has to be a Representation Series. + TODO add typing module link Parameters ---------- @@ -610,6 +680,8 @@ def tsne( -------- `t-SNE on Wikipedia <https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding>`_ + Representation Series: TODO add tutorial link and typing module link + """ tsne = TSNE( n_components=n_components, @@ -619,7 +691,31 @@ def tsne( random_state=random_state, n_jobs=n_jobs, ) - return pd.Series(tsne.fit_transform(list(s)).tolist(), index=s.index) + + if _check_is_valid_representation(s): + + if pd.api.types.is_sparse(s): + s_coo_matrix = s.sparse.to_coo()[0] + else: + # Treat it as a Sparse matrix anyway for efficiency. + s = s.astype("Sparse") + s_coo_matrix = s.sparse.to_coo()[0] + + s_for_vectorization = s_coo_matrix # TSNE can work with sparse input. + + # Else: no Representation Series -> fail + else: + raise ValueError( + f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex, where the first level represent the document and the second one the words/token. The given Pandas Series has {s.index.nlevels} number of levels instead of 2." 
+ ) + + s_out = pd.Series( + tsne.fit_transform(s_for_vectorization).tolist(), index=s.index.unique(level=0) + ) + + s_out = s_out.rename_axis(None) + + return s_out """ diff --git a/texthero/visualization.py b/texthero/visualization.py index b6f70397..829509fc 100644 --- a/texthero/visualization.py +++ b/texthero/visualization.py @@ -62,7 +62,7 @@ def scatterplot( >>> import pandas as pd >>> df = pd.DataFrame(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, fun, guitar"], columns=["texts"]) >>> df["texts"] = hero.clean(df["texts"]).pipe(hero.tokenize) - >>> df["pca"] = hero.tfidf(df["texts"]).pipe(hero.flatten).pipe(hero.pca, n_components=3) # TODO: when others get Representation Support: remove flatten + >>> df["pca"] = hero.tfidf(df["texts"]).pipe(hero.pca, n_components=3) >>> df["topics"] = hero.tfidf(df["texts"]).pipe(hero.flatten).pipe(hero.kmeans, n_clusters=2) # TODO: when others get Representation Support: remove flatten >>> hero.scatterplot(df, col="pca", color="topics", hover_data=["texts"]) # doctest: +SKIP """