apache · coderfender · Apr 24, 2026 · Apr 24, 2026 · Apr 24, 2026 · Apr 25, 2026
diff --git a/benchmarks/src/hj.rs b/benchmarks/src/hj.rs
@@ -25,8 +25,6 @@ use std::path::PathBuf;
 
 use futures::StreamExt;
 
-// TODO: Add existence joins
-
 /// Run the Hash Join benchmark
 ///
 /// This micro-benchmark focuses on the performance characteristics of Hash Joins.
@@ -303,6 +301,90 @@ const HASH_QUERIES: &[HashJoinQuery] = &[
         build_size: "100K_(20%_dups)",
         probe_size: "60M",
     },
+    // RightSemi Join benchmarks with Int32 keys
+    // Fanout: N/A for semi joins (each matching probe row is emitted at most once)
+    //
+    // Q16: RightSemi, Small build (25 rows), 100% Hit rate
+    // Build Side: nation (25 rows) | Probe Side: customer (1.5M rows)
+    HashJoinQuery {
+        sql: r###"SELECT c.k
+        FROM (SELECT CAST(n_nationkey AS INT) as k FROM nation) n
+        RIGHT SEMI JOIN (SELECT CAST(c_nationkey AS INT) as k FROM customer) c
+        ON n.k = c.k"###,
+        density: 1.0,
+        prob_hit: 1.0,
+        build_size: "25",
+        probe_size: "1.5M_RightSemi",
+    },
+    // Q17: RightSemi, Medium build (100K rows), 100% Hit rate
+    // Build Side: supplier (100K rows) | Probe Side: lineitem (60M rows)
+    HashJoinQuery {
+        sql: r###"SELECT l.k
+        FROM (SELECT CAST(s_suppkey AS INT) as k FROM supplier) s
+        RIGHT SEMI JOIN (SELECT CAST(l_suppkey AS INT) as k FROM lineitem) l
+        ON s.k = l.k"###,
+        density: 1.0,
+        prob_hit: 1.0,
+        build_size: "100K",
+        probe_size: "60M_RightSemi",
+    },
+    // Q18: RightSemi, Medium build (100K rows), 10% Hit rate
+    // Build Side: supplier (100K rows) | Probe Side: lineitem (60M rows)
+    HashJoinQuery {
+        sql: r###"SELECT l.k
+        FROM (SELECT CAST(s_suppkey AS INT) as k FROM supplier) s
+        RIGHT SEMI JOIN (
+          SELECT CAST(CASE WHEN l_suppkey % 10 = 0 THEN l_suppkey ELSE l_suppkey + 1000000 END AS INT) as k
+          FROM lineitem
+        ) l
+        ON s.k = l.k"###,
+        density: 1.0,
+        prob_hit: 0.1,
+        build_size: "100K",
+        probe_size: "60M_RightSemi",
+    },
+    // RightAnti Join benchmarks with Int32 keys
+    // Fanout: N/A for anti joins (each non-matching probe row is emitted at most once)
+    //
+    // Q19: RightAnti, Small build (25 rows), 100% Hit rate (no output)
+    // Build Side: nation (25 rows) | Probe Side: customer (1.5M rows)
+    HashJoinQuery {
+        sql: r###"SELECT c.k
+        FROM (SELECT CAST(n_nationkey AS INT) as k FROM nation) n
+        RIGHT ANTI JOIN (SELECT CAST(c_nationkey AS INT) as k FROM customer) c
+        ON n.k = c.k"###,
+        density: 1.0,
+        prob_hit: 1.0,
+        build_size: "25",
+        probe_size: "1.5M_RightAnti",
+    },
+    // Q20: RightAnti, Medium build (100K rows), 100% Hit rate (no output)
+    // Build Side: supplier (100K rows) | Probe Side: lineitem (60M rows)
+    HashJoinQuery {
+        sql: r###"SELECT l.k
+        FROM (SELECT CAST(s_suppkey AS INT) as k FROM supplier) s
+        RIGHT ANTI JOIN (SELECT CAST(l_suppkey AS INT) as k FROM lineitem) l
+        ON s.k = l.k"###,
+        density: 1.0,
+        prob_hit: 1.0,
+        build_size: "100K",
+        probe_size: "60M_RightAnti",
+    },
+    // Q21: RightAnti, Medium build (100K rows), 10% Hit rate (90% output)
+    // Build Side: supplier (100K rows) | Probe Side: lineitem (60M rows)
+    HashJoinQuery {
+        sql: r###"SELECT l.k
+        FROM (SELECT CAST(s_suppkey AS INT) as k FROM supplier) s
+        RIGHT ANTI JOIN (
+          SELECT CAST(CASE WHEN l_suppkey % 10 = 0 THEN l_suppkey ELSE l_suppkey + 1000000 END AS INT) as k
+          FROM lineitem
+        ) l
+        ON s.k = l.k"###,
+        density: 1.0,
+        prob_hit: 0.1,
+        build_size: "100K",
+        probe_size: "60M_RightAnti",
+    },
 ];
 
 impl RunOpt {
@@ -323,7 +405,9 @@ impl RunOpt {
             None => 1..=HASH_QUERIES.len(),
         };
 
-        let config = self.common.config()?;
+        let mut config = self.common.config()?;
+        // Disable join reordering to ensure the optimizer doesn't swap join sides
+        config.options_mut().optimizer.join_reordering = false;
         let rt = self.common.build_runtime()?;
         let ctx = SessionContext::new_with_config_rt(config, rt);
 

diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml
@@ -112,3 +112,8 @@ required-features = ["test_utils"]
 harness = false
 name = "aggregate_vectorized"
 required-features = ["test_utils"]
+
+[[bench]]
+harness = false
+name = "hash_join_semi_anti"
+required-features = ["test_utils"]