From 07a05848b0883da41be3b27d7e6fb354d34fd7dd Mon Sep 17 00:00:00 2001
From: Henri Froese <henri.froese@yahoo.com>
Date: Wed, 5 Aug 2020 20:14:59 +0200
Subject: [PATCH 1/2] Implement support for Representation Series in
 "Dimensionality Reduction" functions of representation module

- Implement full support for Representation Series in "Dimensionality Reduction" functions of representation module
- add appropriate parameterized tests in `test_representation.py`

Note: the dimensionality reduction functions *do support* both flat and representational input; this is because, as discussed in #134, we will not implement a `to_repr` function, so users might have e.g. a flattened tfidf series they want to perform pca on -> they can use the function. Of course, we could remove the flat support (if we basically want to "forbid" users from using `VectorSeries` for tfidf/term_freq/count); we're not 100% sure.

Co-authored-by: Maximilian Krahn <maximilian.krahn@icloud.com>
---
 tests/test_representation.py | 59 ++++++++++++++++++++++-
 texthero/representation.py   | 92 ++++++++++++++++++++++++++++++++++--
 texthero/visualization.py    |  2 +-
 3 files changed, 147 insertions(+), 6 deletions(-)

diff --git a/tests/test_representation.py b/tests/test_representation.py
index 036775af..bdeac7ae 100644
--- a/tests/test_representation.py
+++ b/tests/test_representation.py
@@ -60,6 +60,20 @@ def _tfidf(term, corpus, document_index):
 
 s_tokenized_output_min_df_index = pd.MultiIndex.from_tuples([(0, "Test"), (1, "Test")],)
 
+s_representation_vectors_index = pd.MultiIndex.from_tuples(
+    [(5, "A"), (5, "B"), (7, "A"), (7, "C")]
+)
+
+s_representation_vectors = pd.Series(
+    [1.0, 0.0, 0.0, 1.0], index=s_representation_vectors_index
+)
+
+s_flat_vectors_index = pd.Index([5, 7])
+
+s_flat_vectors = pd.Series(
+    [[1.0, 0.0, 0.0], [0.0, 0.0, 1.0]], index=s_flat_vectors_index
+)
+
 
 test_cases_vectorization = [
     # format: [function_name, function, correct output for tokenized input above, dtype of output]
@@ -85,6 +99,16 @@ def _tfidf(term, corpus, document_index):
     ["tfidf", representation.tfidf, [2.0, 1.0], "float",],
 ]
 
+test_cases_dim_reduction = [
+    # format: [function_name, function, correct output for numeric input above, dtype of output]
+    [
+        "pca",
+        representation.pca,
+        [[0.7071067811865475, 0.0], [-0.7071067811865475, 0.0]],
+    ],
+    ["nmf", representation.nmf, [[0.0, 1.0], [1.0, 0.0]]],
+]
+
 
 class AbstractRepresentationTest(PandasTestCase):
     """
@@ -163,7 +187,40 @@ def test_vectorization_not_tokenized_yet_warning(self, name, test_function, *arg
     @parameterized.expand(test_cases_vectorization)
     def test_vectorization_arguments_to_sklearn(self, name, test_function, *args):
         try:
-            test_function(s_not_tokenized, max_features=1, min_df=1, max_df=1.0)
+            test_function(s_tokenized, max_features=1, min_df=1, max_df=1.0)
+        except TypeError:
+            self.fail("Sklearn arguments not handled correctly.")
+
+    """
+    Dimensionality Reduction
+    """
+
+    @parameterized.expand(test_cases_dim_reduction)
+    def test_dim_reduction_simple_with_index(
+        self, name, test_function, correct_output_values
+    ):
+        s_true = pd.Series(correct_output_values, index=s_flat_vectors_index)
+
+        result_s = test_function(s_representation_vectors, random_state=42)
+
+        # check_less_precise True to prevent rounding errors from giving a Failure.
+        pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True)
+
+    @parameterized.expand(test_cases_dim_reduction)
+    def test_dim_reduction_flat_with_index(
+        self, name, test_function, correct_output_values
+    ):
+        s_true = pd.Series(correct_output_values, index=s_flat_vectors_index)
+
+        result_s = test_function(s_flat_vectors, random_state=42)
+
+        # check_less_precise True to prevent rounding errors from giving a Failure.
+        pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True)
+
+    @parameterized.expand(test_cases_dim_reduction)
+    def test_dim_reduction_arguments_to_sklearn(self, name, test_function, *args):
+        try:
+            test_function(s_representation_vectors, n_components=2, random_state=42)
         except TypeError:
             self.fail("Sklearn arguments not handled correctly.")
 
diff --git a/texthero/representation.py b/texthero/representation.py
index 07b7706c..a35445c8 100644
--- a/texthero/representation.py
+++ b/texthero/representation.py
@@ -432,6 +432,9 @@ def pca(s, n_components=2, random_state=None) -> pd.Series:
 
     In general, *pca* should be called after the text has already been represented to a matrix form.
 
+    The input can either be a Document Representation Series or a flat Series.
+    TODO add tutorial link
+
     Parameters
     ----------
     s : Pandas Series
@@ -466,9 +469,38 @@ def pca(s, n_components=2, random_state=None) -> pd.Series:
     --------
     `PCA on Wikipedia <https://en.wikipedia.org/wiki/Principal_component_analysis>`_
 
+    Document Representation Series: TODO add tutorial link
+
     """
     pca = PCA(n_components=n_components, random_state=random_state, copy=False)
-    return pd.Series(pca.fit_transform(list(s)).tolist(), index=s.index)
+
+    if _check_is_valid_representation(s):
+
+        if pd.api.types.is_sparse(s):
+            s_coo_matrix = s.sparse.to_coo()[0]
+            if s_coo_matrix.shape[1] > 1000:
+                warnings.warn(
+                    "Be careful. You are trying to compute PCA from a Sparse Pandas Series with a very large vocabulary."
+                    " Principal Component Analysis normalize the data and this act requires to expand the input Sparse Matrix."
+                    " This operation might take long. Consider using `svd_truncated` instead as it can deals with Sparse Matrix efficiently."
+                )
+        else:
+            # Treat it as a Sparse matrix anyway for efficiency.
+            s = s.astype("Sparse")
+            s_coo_matrix = s.sparse.to_coo()[0]
+
+        s_for_vectorization = s_coo_matrix.todense()  # PCA cannot handle sparse input.
+
+    # Else: no Document Representation Series -> like before
+    else:
+        s_for_vectorization = list(s)
+
+    s_out = pd.Series(
+        pca.fit_transform(s_for_vectorization).tolist(), index=s.index.unique(level=0),
+    )
+    s_out = s_out.rename_axis(None)
+
+    return s_out
 
 
 def nmf(s, n_components=2, random_state=None) -> pd.Series:
@@ -488,6 +520,8 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series:
     and calculate a vector for each document that places it
     correctly among the topics.
 
+    The input can either be a Document Representation Series or a flat Series.
+    TODO add tutorial link
 
     Parameters
     ----------
@@ -525,9 +559,33 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series:
     --------
     `NMF on Wikipedia <https://en.wikipedia.org/wiki/Non-negative_matrix_factorization>`_
 
+    The input can either be a Document Representation Series or a flat Series.
+    TODO add tutorial link
     """
-    nmf = NMF(n_components=n_components, init="random", random_state=random_state,)
-    return pd.Series(nmf.fit_transform(list(s)).tolist(), index=s.index)
+    nmf = NMF(n_components=n_components, init=None, random_state=random_state)
+
+    if _check_is_valid_representation(s):
+
+        if pd.api.types.is_sparse(s):
+            s_coo_matrix = s.sparse.to_coo()[0]
+        else:
+            # Treat it as a Sparse matrix anyway for efficiency.
+            s = s.astype("Sparse")
+            s_coo_matrix = s.sparse.to_coo()[0]
+
+        s_for_vectorization = s_coo_matrix  # NMF can work with sparse input.
+
+    # Else: no Document Representation Series -> like before
+    else:
+        s_for_vectorization = list(s)
+
+    s_out = pd.Series(
+        nmf.fit_transform(s_for_vectorization).tolist(), index=s.index.unique(level=0),
+    )
+
+    s_out = s_out.rename_axis(None)
+
+    return s_out
 
 
 def tsne(
@@ -554,6 +612,8 @@ def tsne(
     vector in such a way that the differences / similarities between
     documents are preserved.
 
+    The input can either be a Document Representation Series or a flat Series.
+    TODO add tutorial link
 
     Parameters
     ----------
@@ -610,6 +670,8 @@ def tsne(
     --------
     `t-SNE on Wikipedia <https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding>`_
 
+    Document Representation Series: TODO add tutorial link
+
     """
     tsne = TSNE(
         n_components=n_components,
@@ -619,7 +681,29 @@ def tsne(
         random_state=random_state,
         n_jobs=n_jobs,
     )
-    return pd.Series(tsne.fit_transform(list(s)).tolist(), index=s.index)
+
+    if _check_is_valid_representation(s):
+
+        if pd.api.types.is_sparse(s):
+            s_coo_matrix = s.sparse.to_coo()[0]
+        else:
+            # Treat it as a Sparse matrix anyway for efficiency.
+            s = s.astype("Sparse")
+            s_coo_matrix = s.sparse.to_coo()[0]
+
+        s_for_vectorization = s_coo_matrix  # TSNE can work with sparse input.
+
+    # Else: no Document Representation Series -> like before
+    else:
+        s_for_vectorization = list(s)
+
+    s_out = pd.Series(
+        tsne.fit_transform(s_for_vectorization).tolist(), index=s.index.unique(level=0)
+    )
+
+    s_out = s_out.rename_axis(None)
+
+    return s_out
 
 
 """
diff --git a/texthero/visualization.py b/texthero/visualization.py
index b6f70397..829509fc 100644
--- a/texthero/visualization.py
+++ b/texthero/visualization.py
@@ -62,7 +62,7 @@ def scatterplot(
     >>> import pandas as pd
     >>> df = pd.DataFrame(["Football, Sports, Soccer", "music, violin, orchestra", "football, fun, sports", "music, fun, guitar"], columns=["texts"])
     >>> df["texts"] = hero.clean(df["texts"]).pipe(hero.tokenize)
-    >>> df["pca"] = hero.tfidf(df["texts"]).pipe(hero.flatten).pipe(hero.pca, n_components=3) # TODO: when others get Representation Support: remove flatten
+    >>> df["pca"] = hero.tfidf(df["texts"]).pipe(hero.pca, n_components=3)
     >>> df["topics"] = hero.tfidf(df["texts"]).pipe(hero.flatten).pipe(hero.kmeans, n_clusters=2) # TODO: when others get Representation Support: remove flatten
     >>> hero.scatterplot(df, col="pca", color="topics", hover_data=["texts"]) # doctest: +SKIP
     """

From 59d566fdd1c398cf6175b0e91e3da5aaa1bbc092 Mon Sep 17 00:00:00 2001
From: Henri Froese <henri.froese@yahoo.com>
Date: Fri, 7 Aug 2020 19:16:51 +0200
Subject: [PATCH 2/2] Remove support for vector series input in dim. red.
 functions

- dim. red. functions only support Representation Series input
- appropriately change tests
---
 tests/test_indexes.py        | 16 ++++++++++----
 tests/test_representation.py | 11 ----------
 texthero/representation.py   | 42 +++++++++++++++++++++++-------------
 3 files changed, 39 insertions(+), 30 deletions(-)

diff --git a/tests/test_indexes.py b/tests/test_indexes.py
index cc041c3a..af570840 100644
--- a/tests/test_indexes.py
+++ b/tests/test_indexes.py
@@ -12,6 +12,11 @@
 s_tokenized_lists = pd.Series([["Test", "Test2"], ["Test3"]], index=[5, 6])
 s_numeric = pd.Series([5.0], index=[5])
 s_numeric_lists = pd.Series([[5.0, 5.0], [6.0, 6.0]], index=[5, 6])
+s_representation_vectors = pd.Series(
+    [1.0, 0.0, 0.0, 1.0],
+    index=pd.MultiIndex.from_tuples([(5, "A"), (5, "B"), (7, "A"), (7, "C")]),
+)
+
 
 # Define all test cases. Every test case is a list
 # of [name of test case, function to test, tuple of valid input for the function].
@@ -71,9 +76,9 @@
         lambda x: representation.flatten(representation.tfidf(x)),
         (s_tokenized_lists,),
     ],
-    ["pca", representation.pca, (s_numeric_lists, 0)],
-    ["nmf", representation.nmf, (s_numeric_lists,)],
-    ["tsne", representation.tsne, (s_numeric_lists,)],
+    ["pca", representation.pca, (s_representation_vectors,),],
+    ["nmf", representation.nmf, (s_representation_vectors,),],
+    ["tsne", representation.tsne, (s_representation_vectors,),],
     ["kmeans", representation.kmeans, (s_numeric_lists, 1)],
     ["dbscan", representation.dbscan, (s_numeric_lists,)],
     ["meanshift", representation.meanshift, (s_numeric_lists,)],
@@ -107,7 +112,10 @@ def test_correct_index(self, name, test_function, valid_input):
         s = valid_input[0]
         result_s = test_function(*valid_input)
         t_same_index = pd.Series(s.values, s.index)
-        self.assertTrue(result_s.index.equals(t_same_index.index))
+        if isinstance(s.index, pd.MultiIndex):  # if Representation Series
+            self.assertTrue(result_s.index.equals(t_same_index.index.levels[0]))
+        else:
+            self.assertTrue(result_s.index.equals(t_same_index.index))
 
     @parameterized.expand(test_cases)
     def test_incorrect_index(self, name, test_function, valid_input):
diff --git a/tests/test_representation.py b/tests/test_representation.py
index bdeac7ae..06cdf7a4 100644
--- a/tests/test_representation.py
+++ b/tests/test_representation.py
@@ -206,17 +206,6 @@ def test_dim_reduction_simple_with_index(
         # check_less_precise True to prevent rounding errors from giving a Failure.
         pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True)
 
-    @parameterized.expand(test_cases_dim_reduction)
-    def test_dim_reduction_flat_with_index(
-        self, name, test_function, correct_output_values
-    ):
-        s_true = pd.Series(correct_output_values, index=s_flat_vectors_index)
-
-        result_s = test_function(s_flat_vectors, random_state=42)
-
-        # check_less_precise True to prevent rounding errors from giving a Failure.
-        pd.testing.assert_series_equal(s_true, result_s, check_less_precise=True)
-
     @parameterized.expand(test_cases_dim_reduction)
     def test_dim_reduction_arguments_to_sklearn(self, name, test_function, *args):
         try:
diff --git a/texthero/representation.py b/texthero/representation.py
index a35445c8..1e9dd93a 100644
--- a/texthero/representation.py
+++ b/texthero/representation.py
@@ -430,10 +430,16 @@ def pca(s, n_components=2, random_state=None) -> pd.Series:
     are easily visible. The corpus can now be visualized in 3D and we can
     get a good first view of the data!
 
+    Be careful: PCA can *not* handle sparse input, so even when calling PCA with
+    a very sparse Representation Series, internally texthero will compute
+    the whole dense representation, so if you're working with big datasets,
+    you should probably use :meth:`texthero.representation.nmf` or
+    :meth:`texthero.representation.tsne` as they can handle sparse input.
+
     In general, *pca* should be called after the text has already been represented to a matrix form.
 
-    The input can either be a Document Representation Series or a flat Series.
-    TODO add tutorial link
+    The input has to be a Representation Series.
+    TODO add typing module link
 
     Parameters
     ----------
@@ -469,7 +475,7 @@ def pca(s, n_components=2, random_state=None) -> pd.Series:
     --------
     `PCA on Wikipedia <https://en.wikipedia.org/wiki/Principal_component_analysis>`_
 
-    Document Representation Series: TODO add tutorial link
+    Representation Series: TODO add tutorial link and typing module link
 
     """
     pca = PCA(n_components=n_components, random_state=random_state, copy=False)
@@ -491,9 +497,11 @@ def pca(s, n_components=2, random_state=None) -> pd.Series:
 
         s_for_vectorization = s_coo_matrix.todense()  # PCA cannot handle sparse input.
 
-    # Else: no Document Representation Series -> like before
+    # Else: no Representation Series -> fail
     else:
-        s_for_vectorization = list(s)
+        raise ValueError(
+            f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex, where the first level represent the document and the second one the words/token. The given Pandas Series has {s.index.nlevels} number of levels instead of 2."
+        )
 
     s_out = pd.Series(
         pca.fit_transform(s_for_vectorization).tolist(), index=s.index.unique(level=0),
@@ -520,7 +528,7 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series:
     and calculate a vector for each document that places it
     correctly among the topics.
 
-    The input can either be a Document Representation Series or a flat Series.
+    The input has to be a Representation Series.
     TODO add tutorial link
 
     Parameters
@@ -559,9 +567,9 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series:
     --------
     `NMF on Wikipedia <https://en.wikipedia.org/wiki/Non-negative_matrix_factorization>`_
 
-    The input can either be a Document Representation Series or a flat Series.
-    TODO add tutorial link
+    Representation Series: TODO add tutorial link and typing module link
     """
+
     nmf = NMF(n_components=n_components, init=None, random_state=random_state)
 
     if _check_is_valid_representation(s):
@@ -575,9 +583,11 @@ def nmf(s, n_components=2, random_state=None) -> pd.Series:
 
         s_for_vectorization = s_coo_matrix  # NMF can work with sparse input.
 
-    # Else: no Document Representation Series -> like before
+    # Else: no Representation Series -> fail
     else:
-        s_for_vectorization = list(s)
+        raise ValueError(
+            f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex, where the first level represent the document and the second one the words/token. The given Pandas Series has {s.index.nlevels} number of levels instead of 2."
+        )
 
     s_out = pd.Series(
         nmf.fit_transform(s_for_vectorization).tolist(), index=s.index.unique(level=0),
@@ -612,8 +622,8 @@ def tsne(
     vector in such a way that the differences / similarities between
     documents are preserved.
 
-    The input can either be a Document Representation Series or a flat Series.
-    TODO add tutorial link
+    The input has to be a Representation Series.
+    TODO add typing module link
 
     Parameters
     ----------
@@ -670,7 +680,7 @@ def tsne(
     --------
     `t-SNE on Wikipedia <https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding>`_
 
-    Document Representation Series: TODO add tutorial link
+    Representation Series: TODO add tutorial link and typing module link
 
     """
     tsne = TSNE(
@@ -693,9 +703,11 @@ def tsne(
 
         s_for_vectorization = s_coo_matrix  # TSNE can work with sparse input.
 
-    # Else: no Document Representation Series -> like before
+    # Else: no Representation Series -> fail
     else:
-        s_for_vectorization = list(s)
+        raise ValueError(
+            f"The input Pandas Series should be a Representation Pandas Series and should have a MultiIndex, where the first level represent the document and the second one the words/token. The given Pandas Series has {s.index.nlevels} number of levels instead of 2."
+        )
 
     s_out = pd.Series(
         tsne.fit_transform(s_for_vectorization).tolist(), index=s.index.unique(level=0)