Skip to content

Commit 797a9dd

Browse files
committed
feat: Change join order.
The argument order of the join functions (right frame first, then left) was initially chosen with pipelines/chaining in mind, but it makes the operations confusing to read. Joins now take the left frame first. Fixes #156
1 parent c20127b commit 797a9dd

13 files changed

Lines changed: 5837 additions & 5147 deletions

File tree

dataframe.cabal

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,8 @@ test-suite tests
202202
other-modules: Assertions,
203203
Functions,
204204
GenDataFrame,
205+
Internal.Parsing,
206+
IO.JSON,
205207
Operations.Aggregations,
206208
Operations.Apply,
207209
Operations.Core,
@@ -217,9 +219,12 @@ test-suite tests
217219
Operations.Subset,
218220
Operations.Statistics,
219221
Operations.Take,
222+
Operations.Typing,
220223
Parquet,
224+
Properties,
221225
Monad
222226
build-depends: base >= 4 && < 5,
227+
bytestring >= 0.11 && < 0.13,
223228
dataframe >= 0.5 && < 1,
224229
directory >= 1.3.0.0 && < 2,
225230
HUnit ^>= 1.6,

src/DataFrame/Operations/Join.hs

Lines changed: 71 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -170,28 +170,32 @@ ghci> D.innerJoin ["key"] df other
170170
@
171171
-}
172172
innerJoin :: [T.Text] -> DataFrame -> DataFrame -> DataFrame
173-
innerJoin cs right left =
174-
let csSet = S.fromList cs
175-
leftRows = fst (D.dimensions left)
176-
rightRows = fst (D.dimensions right)
177-
178-
leftKeyIdxs = keyColIndices csSet left
179-
rightKeyIdxs = keyColIndices csSet right
180-
leftHashes = D.computeRowHashes leftKeyIdxs left
181-
rightHashes = D.computeRowHashes rightKeyIdxs right
182-
183-
buildRows = min leftRows rightRows
184-
(leftIxs, rightIxs)
185-
| buildRows > joinStrategyThreshold =
186-
sortMergeInnerKernel leftHashes rightHashes
187-
| rightRows <= leftRows =
188-
-- Build on right (smaller or equal), probe with left
189-
hashInnerKernel leftHashes rightHashes
190-
| otherwise =
191-
-- Build on left (smaller), probe with right, swap result
192-
let (!rIxs, !lIxs) = hashInnerKernel rightHashes leftHashes
193-
in (lIxs, rIxs)
194-
in assembleInner csSet left right leftIxs rightIxs
173+
innerJoin cs left right
174+
| D.null right || D.null left = D.empty
175+
| otherwise =
176+
let
177+
csSet = S.fromList cs
178+
leftRows = fst (D.dimensions left)
179+
rightRows = fst (D.dimensions right)
180+
181+
leftKeyIdxs = keyColIndices csSet left
182+
rightKeyIdxs = keyColIndices csSet right
183+
leftHashes = D.computeRowHashes leftKeyIdxs left
184+
rightHashes = D.computeRowHashes rightKeyIdxs right
185+
186+
buildRows = min leftRows rightRows
187+
(leftIxs, rightIxs)
188+
| buildRows > joinStrategyThreshold =
189+
sortMergeInnerKernel leftHashes rightHashes
190+
| rightRows <= leftRows =
191+
-- Build on right (smaller or equal), probe with left
192+
hashInnerKernel leftHashes rightHashes
193+
| otherwise =
194+
-- Build on left (smaller), probe with right, swap result
195+
let (!rIxs, !lIxs) = hashInnerKernel rightHashes leftHashes
196+
in (lIxs, rIxs)
197+
in
198+
assembleInner csSet left right leftIxs rightIxs
195199

196200
{- | Hash-based inner join kernel.
197201
Builds compact index on @buildHashes@ (second arg), probes with
@@ -369,23 +373,28 @@ ghci> D.leftJoin ["key"] df other
369373
@
370374
-}
371375
leftJoin :: [T.Text] -> DataFrame -> DataFrame -> DataFrame
372-
leftJoin cs right left =
373-
let csSet = S.fromList cs
374-
rightRows = fst (D.dimensions right)
375-
376-
leftKeyIdxs = keyColIndices csSet left
377-
rightKeyIdxs = keyColIndices csSet right
378-
leftHashes = D.computeRowHashes leftKeyIdxs left
379-
rightHashes = D.computeRowHashes rightKeyIdxs right
380-
381-
-- Right is always the build side for left join
382-
(leftIxs, rightIxs)
383-
| rightRows > joinStrategyThreshold =
384-
sortMergeLeftKernel leftHashes rightHashes
385-
| otherwise =
386-
hashLeftKernel leftHashes rightHashes
387-
in -- rightIxs uses -1 as sentinel for "no match"
388-
assembleLeft csSet left right leftIxs rightIxs
376+
leftJoin cs left right
377+
| D.null right || D.nRows right == 0 = left
378+
| D.null left || D.nRows left == 0 = D.empty
379+
| otherwise =
380+
let
381+
csSet = S.fromList cs
382+
rightRows = fst (D.dimensions right)
383+
384+
leftKeyIdxs = keyColIndices csSet left
385+
rightKeyIdxs = keyColIndices csSet right
386+
leftHashes = D.computeRowHashes leftKeyIdxs left
387+
rightHashes = D.computeRowHashes rightKeyIdxs right
388+
389+
-- Right is always the build side for left join
390+
(leftIxs, rightIxs)
391+
| rightRows > joinStrategyThreshold =
392+
sortMergeLeftKernel leftHashes rightHashes
393+
| otherwise =
394+
hashLeftKernel leftHashes rightHashes
395+
in
396+
-- rightIxs uses -1 as sentinel for "no match"
397+
assembleLeft csSet left right leftIxs rightIxs
389398

390399
{- | Hash-based left join kernel.
391400
Returns @(leftExpandedIndices, rightExpandedIndices)@ where
@@ -574,24 +583,29 @@ rightJoin cs left right = leftJoin cs right left
574583

575584
fullOuterJoin ::
576585
[T.Text] -> DataFrame -> DataFrame -> DataFrame
577-
fullOuterJoin cs right left =
578-
let csSet = S.fromList cs
579-
leftRows = fst (D.dimensions left)
580-
rightRows = fst (D.dimensions right)
581-
582-
leftKeyIdxs = keyColIndices csSet left
583-
rightKeyIdxs = keyColIndices csSet right
584-
leftHashes = D.computeRowHashes leftKeyIdxs left
585-
rightHashes = D.computeRowHashes rightKeyIdxs right
586-
587-
-- Both sides can have nulls in full outer
588-
(leftIxs, rightIxs)
589-
| max leftRows rightRows > joinStrategyThreshold =
590-
sortMergeFullOuterKernel leftHashes rightHashes
591-
| otherwise =
592-
hashFullOuterKernel leftHashes rightHashes
593-
in -- Both index vectors use -1 as sentinel
594-
assembleFullOuter csSet left right leftIxs rightIxs
586+
fullOuterJoin cs left right
587+
| D.null right || D.nRows right == 0 = left
588+
| D.null left || D.nRows left == 0 = right
589+
| otherwise =
590+
let
591+
csSet = S.fromList cs
592+
leftRows = fst (D.dimensions left)
593+
rightRows = fst (D.dimensions right)
594+
595+
leftKeyIdxs = keyColIndices csSet left
596+
rightKeyIdxs = keyColIndices csSet right
597+
leftHashes = D.computeRowHashes leftKeyIdxs left
598+
rightHashes = D.computeRowHashes rightKeyIdxs right
599+
600+
-- Both sides can have nulls in full outer
601+
(leftIxs, rightIxs)
602+
| max leftRows rightRows > joinStrategyThreshold =
603+
sortMergeFullOuterKernel leftHashes rightHashes
604+
| otherwise =
605+
hashFullOuterKernel leftHashes rightHashes
606+
in
607+
-- Both index vectors use -1 as sentinel
608+
assembleFullOuter csSet left right leftIxs rightIxs
595609

596610
{- | Hash-based full outer join kernel.
597611
Builds compact indices on both sides.

src/DataFrame/Typed/Join.hs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ leftJoin ::
4343
TypedDataFrame right ->
4444
TypedDataFrame (LeftJoinSchema keys left right)
4545
leftJoin (TDF l) (TDF r) =
46-
unsafeFreeze (DJ.leftJoin keyNames r l)
46+
unsafeFreeze (DJ.leftJoin keyNames l r)
4747
where
4848
keyNames = symbolVals @keys
4949

@@ -55,7 +55,7 @@ rightJoin ::
5555
TypedDataFrame right ->
5656
TypedDataFrame (RightJoinSchema keys left right)
5757
rightJoin (TDF l) (TDF r) =
58-
unsafeFreeze (DJ.rightJoin keyNames r l)
58+
unsafeFreeze (DJ.rightJoin keyNames l r)
5959
where
6060
keyNames = symbolVals @keys
6161

src/DataFrame/Typed/Operations.hs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ impute ::
214214
forall name a cols.
215215
( KnownSymbol name
216216
, Columnable a
217-
, (Maybe a) ~ Lookup name cols
217+
, Maybe a ~ Lookup name cols
218218
) =>
219219
a ->
220220
TypedDataFrame cols ->

tests/IO/JSON.hs

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
{-# LANGUAGE OverloadedStrings #-}
2+
3+
module IO.JSON where
4+
5+
import qualified Data.ByteString.Lazy.Char8 as LBSC
6+
import DataFrame.IO.JSON (readJSONEither)
7+
import qualified DataFrame.Internal.Column as DI
8+
import qualified DataFrame.Internal.DataFrame as DI
9+
import qualified DataFrame.Operations.Core as D
10+
import Test.HUnit (
11+
Test (TestCase, TestLabel),
12+
assertBool,
13+
assertEqual,
14+
assertFailure,
15+
)
16+
17+
-- | Happy path: array of objects with string and number columns.
18+
jsonHappyPath :: Test
19+
jsonHappyPath =
20+
TestCase
21+
( case readJSONEither
22+
(LBSC.pack "[{\"name\":\"Alice\",\"age\":30},{\"name\":\"Bob\",\"age\":25}]") of
23+
Left err -> assertFailure $ "Unexpected Left: " ++ err
24+
Right df -> do
25+
assertEqual "Happy path: 2 rows" 2 (D.nRows df)
26+
assertEqual "Happy path: 2 columns" 2 (D.nColumns df)
27+
)
28+
29+
-- | Boolean column is preserved correctly.
30+
jsonBoolColumn :: Test
31+
jsonBoolColumn =
32+
TestCase
33+
( case readJSONEither (LBSC.pack "[{\"flag\":true},{\"flag\":false}]") of
34+
Left err -> assertFailure $ "Unexpected Left: " ++ err
35+
Right df -> do
36+
assertEqual "Bool column: 2 rows" 2 (D.nRows df)
37+
assertEqual "Bool column: 1 column" 1 (D.nColumns df)
38+
)
39+
40+
-- | A key absent from some objects produces an Optional column.
41+
jsonMissingKeyBecomesOptional :: Test
42+
jsonMissingKeyBecomesOptional =
43+
TestCase
44+
( case readJSONEither (LBSC.pack "[{\"a\":1,\"b\":2},{\"a\":3}]") of
45+
Left err -> assertFailure $ "Unexpected Left: " ++ err
46+
Right df -> do
47+
assertEqual "Missing key: 2 rows" 2 (D.nRows df)
48+
assertEqual "Missing key: 2 columns" 2 (D.nColumns df)
49+
-- 'b' is absent from the second row, so must have a missing value
50+
assertBool
51+
"b column should have missing values"
52+
(maybe False DI.hasMissing (DI.getColumn "b" df))
53+
)
54+
55+
-- | When a column's values have different types across rows, the column becomes generic.
56+
jsonMixedTypeColumn :: Test
57+
jsonMixedTypeColumn =
58+
TestCase
59+
( case readJSONEither (LBSC.pack "[{\"x\":1},{\"x\":\"hello\"}]") of
60+
Left err -> assertFailure $ "Unexpected Left: " ++ err
61+
Right df -> do
62+
assertEqual "Mixed type: 2 rows" 2 (D.nRows df)
63+
assertEqual "Mixed type: 1 column" 1 (D.nColumns df)
64+
)
65+
66+
-- | An empty top-level JSON array is rejected.
67+
jsonEmptyArray :: Test
68+
jsonEmptyArray =
69+
TestCase
70+
( case readJSONEither (LBSC.pack "[]") of
71+
Left _ -> return ()
72+
Right _ -> assertFailure "Expected Left for empty array"
73+
)
74+
75+
-- | A non-array top-level JSON value is rejected.
76+
jsonNonArray :: Test
77+
jsonNonArray =
78+
TestCase
79+
( case readJSONEither (LBSC.pack "{\"name\":\"Alice\"}") of
80+
Left _ -> return ()
81+
Right _ -> assertFailure "Expected Left for non-array top-level value"
82+
)
83+
84+
-- | Array elements that are not objects are rejected.
85+
jsonNonObjectElement :: Test
86+
jsonNonObjectElement =
87+
TestCase
88+
( case readJSONEither (LBSC.pack "[1, 2, 3]") of
89+
Left _ -> return ()
90+
Right _ -> assertFailure "Expected Left for non-object array elements"
91+
)
92+
93+
-- | Completely invalid JSON is rejected.
94+
jsonInvalidJSON :: Test
95+
jsonInvalidJSON =
96+
TestCase
97+
( case readJSONEither (LBSC.pack "not valid json at all") of
98+
Left _ -> return ()
99+
Right _ -> assertFailure "Expected Left for invalid JSON"
100+
)
101+
102+
tests :: [Test]
103+
tests =
104+
[ TestLabel "jsonHappyPath" jsonHappyPath
105+
, TestLabel "jsonBoolColumn" jsonBoolColumn
106+
, TestLabel "jsonMissingKeyBecomesOptional" jsonMissingKeyBecomesOptional
107+
, TestLabel "jsonMixedTypeColumn" jsonMixedTypeColumn
108+
, TestLabel "jsonEmptyArray" jsonEmptyArray
109+
, TestLabel "jsonNonArray" jsonNonArray
110+
, TestLabel "jsonNonObjectElement" jsonNonObjectElement
111+
, TestLabel "jsonInvalidJSON" jsonInvalidJSON
112+
]

0 commit comments

Comments
 (0)