Skip to content

Commit 6862fc4

Browse files
committed
Add groupBy benchmark.
1 parent ca5ff95 commit 6862fc4

3 files changed

Lines changed: 58 additions & 3 deletions

File tree

benchmark/Main.hs

Lines changed: 25 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@ import qualified Data.Vector.Unboxed.Mutable as VUM
77

88
import Control.Monad (replicateM)
99
import Criterion.Main
10+
import DataFrame ((|>))
1011
import Data.Time
1112
import System.Process
1213
import System.Random.Stateful
@@ -55,10 +56,31 @@ pandas = do
5556
output <- readProcess "./benchmark/dataframe_benchmark/bin/python3" ["./benchmark/pandas/pandas_benchmark.py"] ""
5657
putStrLn output
5758

59+
-- | Haskell side of the group-by benchmark.
--
-- Reads @./data/housing.csv@, groups the rows by @ocean_proximity@,
-- aggregates @median_house_value@ with both 'D.Minimum' and 'D.Maximum',
-- keeps the three selected columns and prints the resulting frame.
groupByHaskell :: IO ()
groupByHaskell = do
    housing <- D.readCsv "./data/housing.csv"
    let grouped = housing |> D.groupBy ["ocean_proximity"]
        extremes = grouped |> D.aggregate [("median_house_value", D.Minimum), ("median_house_value", D.Maximum)]
        report = extremes |> D.select ["ocean_proximity", "Minimum_median_house_value", "Maximum_median_house_value"]
    print report
65+
66+
-- | Polars side of the group-by benchmark.
--
-- Runs the Polars script with the @python3@ found under
-- @./benchmark/dataframe_benchmark/bin@, passing empty standard input, and
-- echoes the captured standard output.
groupByPolars :: IO ()
groupByPolars = readProcess interpreter [script] "" >>= putStrLn
  where
    interpreter = "./benchmark/dataframe_benchmark/bin/python3"
    script = "./benchmark/polars/group_by.py"
70+
71+
-- | Pandas side of the group-by benchmark.
--
-- Runs the pandas script with the @python3@ found under
-- @./benchmark/dataframe_benchmark/bin@, passing empty standard input, and
-- echoes the captured standard output.
groupByPandas :: IO ()
groupByPandas = readProcess interpreter [script] "" >>= putStrLn
  where
    interpreter = "./benchmark/dataframe_benchmark/bin/python3"
    script = "./benchmark/pandas/group_by.py"
75+
5876
-- | Criterion entry point.
--
-- Registers the simple-statistics and group-by benchmarks for each of the
-- three implementations under the single @"stats"@ group and hands them to
-- 'defaultMain'.
main :: IO ()
main =
    defaultMain
        [ bgroup
            "stats"
            -- A list literal cannot open with a separator: the first entry
            -- follows the bracket directly and only later entries lead
            -- with a comma.
            [ bench "simpleStatsHaskell" $ nfIO haskell
            , bench "simpleStatsPandas" $ nfIO pandas
            , bench "simpleStatsPolars" $ nfIO polars
            , bench "groupByHaskell" $ nfIO groupByHaskell
            , bench "groupByPolars" $ nfIO groupByPolars
            , bench "groupByPandas" $ nfIO groupByPandas
            ]
        ]

benchmark/pandas/group_by.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Pandas side of the group-by benchmark: per-category minimum and maximum of
# the median house value, printed to standard output.
import pandas as pd

housing = pd.read_csv("./data/housing.csv")

# Select the value column of each ocean_proximity group, then aggregate and
# name both result columns in the same call.
value_by_proximity = housing.groupby("ocean_proximity")["median_house_value"]
summary = value_by_proximity.agg(
    Minimum_median_house_value="min",
    Maximum_median_house_value="max",
)

print(summary)

benchmark/polars/group_by.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Polars side of the group-by benchmark: per-category minimum and maximum of
# the median house value, printed to standard output.
#
# Only polars is imported. The script is executed as a timed subprocess, so
# modules that are never used would only add work to this side of the
# comparison.
import polars as pl

df = pl.read_csv("./data/housing.csv")

# One output row per ocean_proximity value, with both aggregates named
# explicitly via alias().
result = (
    df
    .group_by("ocean_proximity")
    .agg([
        pl.col("median_house_value").min().alias("Minimum_median_house_value"),
        pl.col("median_house_value").max().alias("Maximum_median_house_value"),
    ])
)

print(result)

0 commit comments

Comments
 (0)