From d420a73825196028a2cd4a5738eb6168dd2ea3b7 Mon Sep 17 00:00:00 2001 From: Jeethu Rao Date: Thu, 21 Mar 2024 12:25:34 +0000 Subject: [PATCH 1/4] Use numba-stats if available --- numerai_tools/scoring.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/numerai_tools/scoring.py b/numerai_tools/scoring.py index f97d14b..2ef0d3f 100644 --- a/numerai_tools/scoring.py +++ b/numerai_tools/scoring.py @@ -2,7 +2,12 @@ import numpy as np import pandas as pd -from scipy import stats + +try: + from numba_stats import norm +except ImportError: + from scipy.stats import norm + from sklearn.preprocessing import OneHotEncoder @@ -171,7 +176,7 @@ def gaussian(df: pd.DataFrame) -> pd.DataFrame: pd.DataFrame - the gaussianized data """ assert np.array_equal(df.index.sort_values(), df.index) - return df.apply(lambda series: stats.norm.ppf(series)) + return df.apply(lambda series: norm.ppf(series)) def orthogonalize(v: np.ndarray, u: np.ndarray) -> np.ndarray: From 6dfbe75edfa04da982e14147f781f33549492b92 Mon Sep 17 00:00:00 2001 From: Jeethu Rao Date: Thu, 21 Mar 2024 14:37:49 +0000 Subject: [PATCH 2/4] specify default loc and scale for norm.ppf from scipy impl --- numerai_tools/scoring.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/numerai_tools/scoring.py b/numerai_tools/scoring.py index 2ef0d3f..107660f 100644 --- a/numerai_tools/scoring.py +++ b/numerai_tools/scoring.py @@ -1,3 +1,4 @@ +import functools from typing import List, Tuple, Union, Optional import numpy as np @@ -176,7 +177,7 @@ def gaussian(df: pd.DataFrame) -> pd.DataFrame: pd.DataFrame - the gaussianized data """ assert np.array_equal(df.index.sort_values(), df.index) - return df.apply(lambda series: norm.ppf(series)) + return df.apply(functools.partial(norm.ppf, loc=0, scale=1)) def orthogonalize(v: np.ndarray, u: np.ndarray) -> np.ndarray: From 642b17ee7068f4f228acb76517d68d1d4cbaca61 Mon Sep 17 00:00:00 2001 From: Jeethu Rao Date: Thu, 23 May 2024 14:07:04 +0100 Subject: [PATCH 3/4] Add test and option to disable numba_stats with env var --- numerai_tools/scoring.py | 3 +++ tests/test_scoring.py | 27 ++++++++++++++++++++------- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/numerai_tools/scoring.py b/numerai_tools/scoring.py index 107660f..e7d8c6d 100644 --- a/numerai_tools/scoring.py +++ b/numerai_tools/scoring.py @@ -1,3 +1,4 @@ +import os import functools from typing import List, Tuple, Union, Optional @@ -5,6 +6,8 @@ import pandas as pd try: + if os.environ.get("DISABLE_NUMBA_STATS"): + raise ImportError from numba_stats import norm except ImportError: from scipy.stats import norm diff --git a/tests/test_scoring.py b/tests/test_scoring.py index 8cd9aa9..71c21d0 100644 --- a/tests/test_scoring.py +++ b/tests/test_scoring.py @@ -1,4 +1,5 @@ import unittest +from unittest.mock import patch import numpy as np import pandas as pd @@ -21,6 +22,9 @@ stake_weight, ) +from numba_stats import norm as numba_stats_norm +from scipy.stats import norm as scipy_stats_norm + class TestScoring(unittest.TestCase): def setUp(self): @@ -34,6 +38,11 @@ def setUp(self): self.pos_neg = pd.Series([0, -0, 0.5, -0.5, 1.0, -1.0, 2.0, -2.0]).rename( "pos_neg" ) + self.s = [x/4 for x in range(5)] + self.df = pd.DataFrame({ + "target": self.s, + "prediction": reversed(self.s) + }) def test_correlation(self): assert np.isclose(correlation(self.up, self.up), 1) @@ -201,10 +210,14 @@ def test_neutralize(self): ).all() def test_numerai_corr_doesnt_clobber_targets(self): - s = [x/4 for x in range(5)] - df = pd.DataFrame({ - "target": s, - "prediction": reversed(s) - }) - numerai_corr(df[["prediction"]], df["target"]) - assert pd.Series(s).equals(df["target"]), f"{s} != {list(df['target'].values)}" + numerai_corr(self.df[["prediction"]], self.df["target"]) + assert pd.Series(self.s).equals(self.df["target"]), \ + f"{self.s} != {list(self.df['target'].values)}" + + def test_numerai_corr_is_same_with_scipy_and_numba(self): + with patch("numerai_tools.scoring.norm", new=scipy_stats_norm): + corr1 = numerai_corr( + self.df[["prediction"]], self.df["target"]) + with patch("numerai_tools.scoring.norm", new=numba_stats_norm): + corr2 = numerai_corr(self.df[["prediction"]], self.df["target"]) + assert np.isclose(corr1, corr2) From 58188063cbd7d60066367e41982424e54052bd6d Mon Sep 17 00:00:00 2001 From: Jeethu Rao Date: Thu, 23 May 2024 14:41:32 +0100 Subject: [PATCH 4/4] Add numba-stats to dev extras_require in setup.py --- .github/workflows/test-and-deploy.yml | 2 +- setup.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test-and-deploy.yml b/.github/workflows/test-and-deploy.yml index 4b5612f..7bd8eea 100644 --- a/.github/workflows/test-and-deploy.yml +++ b/.github/workflows/test-and-deploy.yml @@ -24,7 +24,7 @@ jobs: - name: Test id: test run: | - pip install . + pip install .[dev] python -m unittest discover tests/ - name: Deploy diff --git a/setup.py b/setup.py index eba44f0..835a92d 100644 --- a/setup.py +++ b/setup.py @@ -41,4 +41,7 @@ def load(path): "scipy~=1.11.4", "scikit-learn>=1.3.0", ], + extras_require={ + "dev": ["numba-stats>=1.7.0"] + }, )