From 8da47b0fe14d54e0e45f349307c5a2dac02a7f06 Mon Sep 17 00:00:00 2001
From: semyonsinchenko <ssinchenko@apache.org>
Date: Sat, 6 Sep 2025 20:17:48 +0200
Subject: [PATCH 1/4] Using of dataframe! macro in tests

---
 src/lib.rs    | 212 ++++++++++++++++++++++++++++++++++++++++----------
 src/pregel.rs |  33 ++------
 2 files changed, 179 insertions(+), 66 deletions(-)
diff --git a/src/lib.rs b/src/lib.rs
index 2590090..3b8d9ec 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,3 +1,4 @@
+mod connected_components;
 mod pagerank;
 mod pregel;
 mod shortest_paths;
@@ -7,13 +8,54 @@ use datafusion::error::Result;
 use datafusion::functions_aggregate::count::count;
 use datafusion::prelude::*;
 
+/// Column name of the vertex ID column.
 pub const VERTEX_ID: &str = "id";
+/// Column name of the edge source column.
 pub const EDGE_SRC: &str = "src";
+/// Column name of the edge destination column.
 pub const EDGE_DST: &str = "dst";
+/// Column name of the edge column in the triplet representation.
 pub const EDGE_COL: &str = "edge";
+/// Column name of the source vertex in the triplet representation.
 pub const SRC_VERTEX: &str = "src_vertex";
+/// Column name of the destination vertex in the triplet representation.
 pub const DST_VERTEX: &str = "dst_vertex";
 
+/// A data structure representing a graph in the form of vertices and edges.
+///
+/// The `GraphFrame` struct is designed to hold a graph's data where vertices
+/// (nodes) and edges (connections) are represented as `DataFrame` structures.
+///
+/// # Fields
+///
+/// * `vertices` - A `DataFrame` that contains information about the graph's vertices.
+///                Each row in the `DataFrame` represents a vertex (`VERTEX_ID`), and additional
+///                columns can store attributes (e.g., labels or properties) for
+///                each vertex.
+///
+/// * `edges` - A `DataFrame` that contains information about the graph's edges.
+///             Each row in the `DataFrame` represents an edge, with columns
+///             typically storing the source vertex (`EDGE_SRC`), destination vertex (`EDGE_DST`), and
+///             any additional attributes (e.g., weights or labels) associated
+///             with the edge.
+///
+/// # Example
+///
+/// ```
+/// use datafusion::dataframe;
+/// use graphframes_rs::{GraphFrame, VERTEX_ID, EDGE_SRC, EDGE_DST};
+/// let vertices = dataframe!(
+///   VERTEX_ID => vec![1i64, 2i64, 3i64],
+///   "attr" => vec!["a", "b", "c"]
+/// ).unwrap();
+/// let edges = dataframe!(
+///   EDGE_SRC => vec![1i64, 2i64, 3i64],
+///   EDGE_DST => vec![3i64, 1i64, 2i64],
+///   "attr" => vec!["d", "j", "h"]
+/// ).unwrap();
+///
+/// let graph = GraphFrame { vertices, edges };
+/// ```
 #[derive(Debug, Clone)]
 pub struct GraphFrame {
     pub vertices: DataFrame,
@@ -21,16 +63,98 @@ pub struct GraphFrame {
 }
 
 impl GraphFrame {
+    /// Returns the total number of nodes in the graph.
+    ///
+    /// # Returns
+    ///
+    /// This function returns a `Result<i64>`:
+    /// - `Ok(i64)`: The total number of nodes (vertices) in the graph, represented as a 64-bit signed integer.
+    /// - `Err`: If an error occurs during the computation or retrieval of the node count.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use datafusion::dataframe;
+    /// use graphframes_rs::{GraphFrame, VERTEX_ID, EDGE_SRC, EDGE_DST};
+    /// let vertices = dataframe!(
+    ///   VERTEX_ID => vec![1i64, 2i64, 3i64],
+    ///   "attr" => vec!["a", "b", "c"]
+    /// ).unwrap();
+    /// let edges = dataframe!(
+    ///   EDGE_SRC => vec![1i64, 2i64, 3i64],
+    ///   EDGE_DST => vec![3i64, 1i64, 2i64],
+    ///   "attr" => vec!["d", "j", "h"]
+    /// ).unwrap();
+    ///
+    /// let graph = GraphFrame { vertices, edges };
+    /// let node_count = graph.num_nodes();
+    /// ```
     pub async fn num_nodes(&self) -> Result<i64> {
         let count = self.vertices.clone().count().await?;
         Ok(count as i64)
     }
 
+    /// Returns the total number of edges in the graph.
+    ///
+    /// # Returns
+    ///
+    /// This function returns a `Result<i64>`:
+    /// - `Ok(i64)` - The total number of edges, represented as a 64-bit integer.
+    /// - `Err(E)` - If an error occurs during the computation, the error is propagated.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use datafusion::dataframe;
+    /// use graphframes_rs::{GraphFrame, VERTEX_ID, EDGE_SRC, EDGE_DST};
+    /// let vertices = dataframe!(
+    ///   VERTEX_ID => vec![1i64, 2i64, 3i64],
+    ///   "attr" => vec!["a", "b", "c"]
+    /// ).unwrap();
+    /// let edges = dataframe!(
+    ///   EDGE_SRC => vec![1i64, 2i64, 3i64],
+    ///   EDGE_DST => vec![3i64, 1i64, 2i64],
+    ///   "attr" => vec!["d", "j", "h"]
+    /// ).unwrap();
+    ///
+    /// let graph = GraphFrame { vertices, edges };
+    /// let edge_count = graph.num_edges();
+    /// ```
     pub async fn num_edges(&self) -> Result<i64> {
         let count = self.edges.clone().count().await?;
         Ok(count as i64)
     }
 
+    /// Computes the in-degrees for each vertex in the graph.
+    ///
+    /// This function calculates the in-degree of each vertex by counting the number of
+    /// incoming edges. It returns a `DataFrame`
+    /// containing two columns:
+    /// - `VERTEX_ID`: The unique identifier of the vertex (derived from the destination of the edges).
+    /// - `in_degree`: The count of incoming edges (in-degrees) for each vertex.
+    ///
+    /// # Returns
+    /// An asynchronous function that returns:
+    /// - `Ok(DataFrame)` containing the vertex IDs and their corresponding in-degrees.
+    /// - `Err` if the aggregation or selection operation fails.
+    ///
+    /// # Example
+    /// ```rust
+    /// use datafusion::dataframe;
+    /// use graphframes_rs::{GraphFrame, VERTEX_ID, EDGE_SRC, EDGE_DST};
+    /// let vertices = dataframe!(
+    ///   VERTEX_ID => vec![1i64, 2i64, 3i64],
+    ///   "attr" => vec!["a", "b", "c"]
+    /// ).unwrap();
+    /// let edges = dataframe!(
+    ///   EDGE_SRC => vec![1i64, 2i64, 3i64],
+    ///   EDGE_DST => vec![3i64, 1i64, 2i64],
+    ///   "attr" => vec!["d", "j", "h"]
+    /// ).unwrap();
+    ///
+    /// let graph = GraphFrame { vertices, edges };
+    /// let in_degrees = graph.in_degrees();
+    /// ```
     pub async fn in_degrees(&self) -> Result<DataFrame> {
         let df = self.edges.clone().aggregate(
             vec![col(EDGE_DST)],
@@ -38,7 +162,36 @@ impl GraphFrame {
         )?;
         Ok(df.select(vec![col(EDGE_DST).alias(VERTEX_ID), col("in_degree")])?)
     }
-
+    /// Computes the out-degrees for each vertex in the graph.
+    ///
+    /// This function calculates the out-degree of each vertex by counting the number of
+    /// outgoing edges. It returns a `DataFrame`
+    /// containing two columns:
+    /// - `VERTEX_ID`: The unique identifier of the vertex (derived from the source of the edges).
+    /// - `out_degree`: The count of outgoing edges (out-degrees) for each vertex.
+    ///
+    /// # Returns
+    /// An asynchronous function that returns:
+    /// - `Ok(DataFrame)` containing the vertex IDs and their corresponding out-degrees.
+    /// - `Err` if the aggregation or selection operation fails.
+    ///
+    /// # Example
+    /// ```rust
+    /// use datafusion::dataframe;
+    /// use graphframes_rs::{GraphFrame, VERTEX_ID, EDGE_SRC, EDGE_DST};
+    /// let vertices = dataframe!(
+    ///   VERTEX_ID => vec![1i64, 2i64, 3i64],
+    ///   "attr" => vec!["a", "b", "c"]
+    /// ).unwrap();
+    /// let edges = dataframe!(
+    ///   EDGE_SRC => vec![1i64, 2i64, 3i64],
+    ///   EDGE_DST => vec![3i64, 1i64, 2i64],
+    ///   "attr" => vec!["d", "j", "h"]
+    /// ).unwrap();
+    ///
+    /// let graph = GraphFrame { vertices, edges };
+    /// let out_degrees = graph.out_degrees();
+    /// ```
     pub async fn out_degrees(&self) -> Result<DataFrame> {
         let df = self.edges.clone().aggregate(
             vec![col(EDGE_SRC)],
@@ -133,7 +286,6 @@ impl GraphFrame {
     /// let graph = GraphFrame { vertices, edges };
     /// let triplets = graph.triplets();
     /// ```
-    /// // Assuming `edges_df` and `vertices_df` are initialized DataFrames for
     pub async fn triplets(&self) -> Result<DataFrame> {
         let edges_struct = self.edges.clone().select(vec![
             col(EDGE_SRC),
@@ -189,46 +341,24 @@ impl GraphFrame {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use datafusion::arrow::array::{Int64Array, RecordBatch, StringArray};
-    use datafusion::arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef};
+    use datafusion::arrow::array::Int64Array;
+    use datafusion::arrow::datatypes::{DataType, Field, Fields};
     use std::collections::HashMap;
-    use std::sync::Arc;
-
-    fn create_test_graph() -> Result<GraphFrame> {
-        let ctx = SessionContext::new();
-
-        let vertices_data = RecordBatch::try_new(
-            SchemaRef::from(Schema::new(vec![
-                Field::new("id", DataType::Int64, false),
-                Field::new("name", DataType::Utf8, false),
-            ])),
-            vec![
-                Arc::new(Int64Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10])),
-                Arc::new(StringArray::from(vec![
-                    "Hub", "Alice", "Bob", "Carol", "David", "Eve", "Frank", "Grace", "Henry",
-                    "Ivy",
-                ])),
-            ],
-        );
-        let vertices = ctx.read_batch(vertices_data?)?;
-
-        let edges_data = RecordBatch::try_new(
-            SchemaRef::from(Schema::new(vec![
-                Field::new("src", DataType::Int64, false),
-                Field::new("dst", DataType::Int64, false),
-            ])),
-            vec![
-                Arc::new(Int64Array::from(vec![
-                    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7,
-                    8, 8, 9, 10,
-                ])),
-                Arc::new(Int64Array::from(vec![
-                    2, 3, 4, 5, 6, 7, 8, 9, 10, 3, 4, 5, 6, 4, 5, 6, 5, 6, 7, 6, 7, 8, 7, 8, 8, 9,
-                    9, 10, 10, 1,
-                ])),
-            ],
-        );
-        let edges = ctx.read_batch(edges_data?)?;
+
+    pub(crate) fn create_test_graph() -> Result<GraphFrame> {
+        let vertices = dataframe!(
+            VERTEX_ID => vec![1i64, 2i64, 3i64, 4i64, 5i64, 6i64, 7i64, 8i64, 9i64, 10i64],
+            "name" => vec!["Hub", "Alice", "Bob", "Carol", "David", "Eve", "Frank", "Grace", "Henry", "Ivy"]
+        )?;
+
+        let edges = dataframe!(
+            EDGE_SRC => Vec::<i64>::from(
+                vec![1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10,]
+            ),
+            EDGE_DST => Vec::<i64>::from(
+                vec![2, 3, 4, 5, 6, 7, 8, 9, 10, 3, 4, 5, 6, 4, 5, 6, 5, 6, 7, 6, 7, 8, 7, 8, 8, 9, 9, 10, 10, 1,]
+            ),
+        )?;
 
         Ok(GraphFrame { vertices, edges })
     }
diff --git a/src/pregel.rs b/src/pregel.rs
index efbedf8..ffca023 100644
--- a/src/pregel.rs
+++ b/src/pregel.rs
@@ -413,36 +413,19 @@ impl GraphFrame {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use datafusion::arrow::array::{Array, Int32Array, Int64Array, RecordBatch};
-    use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+    use datafusion::arrow::array::{Array, Int32Array, Int64Array};
     use datafusion::functions_aggregate::min_max::max;
     use datafusion::functions_aggregate::sum::sum;
-    use std::sync::Arc;
 
     fn create_graph(vertices: Vec<i64>, edges: Vec<Vec<i64>>) -> Result<GraphFrame> {
-        let ctx = SessionContext::new();
-
-        let vertices_data = RecordBatch::try_new(
-            SchemaRef::from(Schema::new(vec![Field::new("id", DataType::Int64, false)])),
-            vec![Arc::new(Int64Array::from(vertices))],
-        )?;
-        let vertices_df = ctx.read_batch(vertices_data)?;
-
-        let edges_data = RecordBatch::try_new(
-            SchemaRef::from(Schema::new(vec![
-                Field::new("src", DataType::Int64, false),
-                Field::new("dst", DataType::Int64, false),
-            ])),
-            vec![
-                Arc::new(Int64Array::from(
-                    edges.iter().map(|e| e[0]).collect::<Vec<i64>>(),
-                )),
-                Arc::new(Int64Array::from(
-                    edges.iter().map(|e| e[1]).collect::<Vec<i64>>(),
-                )),
-            ],
+        let vertices_df = dataframe!(
+            VERTEX_ID => Vec::<i64>::from(vertices),
         )?;
-        let edges_df = ctx.read_batch(edges_data)?;
+        let edges_df = dataframe!(EDGE_SRC => Vec::<i64>::from(
+            edges.iter().map(|e| e[0]).collect::<Vec<i64>>()
+        ), EDGE_DST => Vec::<i64>::from(
+            edges.iter().map(|e| e[1]).collect::<Vec<i64>>()
+        ))?;
 
         Ok(GraphFrame {
             vertices: vertices_df,

From a0ac9faf5041be68e1e781703b0537928d556738 Mon Sep 17 00:00:00 2001
From: semyonsinchenko <ssinchenko@apache.org>
Date: Mon, 8 Sep 2025 16:50:50 +0200
Subject: [PATCH 2/4] Add connected components benchmark and update
 dependencies

- Added `cc_benchmark.rs` for evaluating connected components performance.
- Updated `criterion` to v0.7 and refined benchmark configurations.
- Upgraded `datafusion` to v49.0.2.
- Modified `run_benchmarks.py` to include a `--weighted` graph option.
- Updated documentation and comments across the codebase for clarity.
---
 Cargo.lock                    | 221 ++++++++++++++++++++--------------
 Cargo.toml                    |  11 +-
 benches/README.md             |  10 +-
 benches/cc_benchmark.rs       |  42 +++++++
 benches/pagerank_benchmark.rs |  11 +-
 run_benchmarks.py             |  28 +++--
 src/connected_components.rs   |   4 +-
 src/util.rs                   |   2 +
 8 files changed, 217 insertions(+), 112 deletions(-)
 create mode 100644 benches/cc_benchmark.rs

diff --git a/Cargo.lock b/Cargo.lock
index aec0f0b..1b69ad6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -223,6 +223,7 @@ dependencies = [
  "arrow-schema",
  "flatbuffers",
  "lz4_flex",
+ "zstd",
 ]
 
 [[package]]
@@ -320,7 +321,7 @@ version = "0.4.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "06575e6a9673580f52661c92107baabffbf41e2141373441cbcdc47cb733003c"
 dependencies = [
- "bzip2",
+ "bzip2 0.5.2",
  "flate2",
  "futures-core",
  "memchr",
@@ -476,6 +477,15 @@ dependencies = [
  "bzip2-sys",
 ]
 
+[[package]]
+name = "bzip2"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bea8dcd42434048e4f7a304411d9273a411f647446c1234a65ce0554923f4cff"
+dependencies = [
+ "libbz2-rs-sys",
+]
+
 [[package]]
 name = "bzip2-sys"
 version = "0.1.13+1.0.8"
@@ -645,26 +655,22 @@ dependencies = [
 
 [[package]]
 name = "criterion"
-version = "0.5.1"
+version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
+checksum = "e1c047a62b0cc3e145fa84415a3191f628e980b194c2755aa12300a4e6cbd928"
 dependencies = [
  "anes",
  "cast",
  "ciborium",
  "clap",
  "criterion-plot",
- "futures",
- "is-terminal",
- "itertools 0.10.5",
+ "itertools 0.13.0",
  "num-traits",
- "once_cell",
  "oorandom",
  "plotters",
  "rayon",
  "regex",
  "serde",
- "serde_derive",
  "serde_json",
  "tinytemplate",
  "tokio",
@@ -673,12 +679,12 @@ dependencies = [
 
 [[package]]
 name = "criterion-plot"
-version = "0.5.0"
+version = "0.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
+checksum = "9b1bcc0dc7dfae599d84ad0b1a55f80cde8af3725da8313b528da95ef783e338"
 dependencies = [
  "cast",
- "itertools 0.10.5",
+ "itertools 0.13.0",
 ]
 
 [[package]]
@@ -759,16 +765,16 @@ dependencies = [
 
 [[package]]
 name = "datafusion"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8a11e19a7ccc5bb979c95c1dceef663eab39c9061b3bbf8d1937faf0f03bf41f"
+checksum = "69dfeda1633bf8ec75b068d9f6c27cdc392ffcf5ff83128d5dbab65b73c1fd02"
 dependencies = [
  "arrow",
  "arrow-ipc",
  "arrow-schema",
  "async-trait",
  "bytes",
- "bzip2",
+ "bzip2 0.6.0",
  "chrono",
  "datafusion-catalog",
  "datafusion-catalog-listing",
@@ -795,6 +801,7 @@ dependencies = [
  "datafusion-sql",
  "flate2",
  "futures",
+ "hex",
  "itertools 0.14.0",
  "log",
  "object_store",
@@ -813,9 +820,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-catalog"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "94985e67cab97b1099db2a7af11f31a45008b282aba921c1e1d35327c212ec18"
+checksum = "2848fd1e85e2953116dab9cc2eb109214b0888d7bbd2230e30c07f1794f642c0"
 dependencies = [
  "arrow",
  "async-trait",
@@ -839,9 +846,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-catalog-listing"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e002df133bdb7b0b9b429d89a69aa77b35caeadee4498b2ce1c7c23a99516988"
+checksum = "051a1634628c2d1296d4e326823e7536640d87a118966cdaff069b68821ad53b"
 dependencies = [
  "arrow",
  "async-trait",
@@ -862,16 +869,18 @@ dependencies = [
 
 [[package]]
 name = "datafusion-common"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e13242fc58fd753787b0a538e5ae77d356cb9d0656fa85a591a33c5f106267f6"
+checksum = "765e4ad4ef7a4500e389a3f1e738791b71ff4c29fd00912c2f541d62b25da096"
 dependencies = [
  "ahash",
  "arrow",
  "arrow-ipc",
  "base64",
+ "chrono",
  "half",
  "hashbrown 0.14.5",
+ "hex",
  "indexmap",
  "libc",
  "log",
@@ -886,9 +895,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-common-runtime"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d2239f964e95c3a5d6b4a8cde07e646de8995c1396a7fd62c6e784f5341db499"
+checksum = "40a2ae8393051ce25d232a6065c4558ab5a535c9637d5373bacfd464ac88ea12"
 dependencies = [
  "futures",
  "log",
@@ -897,15 +906,15 @@ dependencies = [
 
 [[package]]
 name = "datafusion-datasource"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2cf792579bc8bf07d1b2f68c2d5382f8a63679cce8fbebfd4ba95742b6e08864"
+checksum = "90cd841a77f378bc1a5c4a1c37345e1885a9203b008203f9f4b3a769729bf330"
 dependencies = [
  "arrow",
  "async-compression",
  "async-trait",
  "bytes",
- "bzip2",
+ "bzip2 0.6.0",
  "chrono",
  "datafusion-common",
  "datafusion-common-runtime",
@@ -933,9 +942,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-datasource-csv"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cfc114f9a1415174f3e8d2719c371fc72092ef2195a7955404cfe6b2ba29a706"
+checksum = "77f4a2c64939c6f0dd15b246723a699fa30d59d0133eb36a86e8ff8c6e2a8dc6"
 dependencies = [
  "arrow",
  "async-trait",
@@ -958,9 +967,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-datasource-json"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d88dd5e215c420a52362b9988ecd4cefd71081b730663d4f7d886f706111fc75"
+checksum = "11387aaf931b2993ad9273c63ddca33f05aef7d02df9b70fb757429b4b71cdae"
 dependencies = [
  "arrow",
  "async-trait",
@@ -983,9 +992,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-datasource-parquet"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "33692acdd1fbe75280d14f4676fe43f39e9cb36296df56575aa2cac9a819e4cf"
+checksum = "028f430c5185120bf806347848b8d8acd9823f4038875b3820eeefa35f2bb4a2"
 dependencies = [
  "arrow",
  "async-trait",
@@ -1001,8 +1010,10 @@ dependencies = [
  "datafusion-physical-expr-common",
  "datafusion-physical-optimizer",
  "datafusion-physical-plan",
+ "datafusion-pruning",
  "datafusion-session",
  "futures",
+ "hex",
  "itertools 0.14.0",
  "log",
  "object_store",
@@ -1014,15 +1025,15 @@ dependencies = [
 
 [[package]]
 name = "datafusion-doc"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e0e7b648387b0c1937b83cb328533c06c923799e73a9e3750b762667f32662c0"
+checksum = "8ff336d1d755399753a9e4fbab001180e346fc8bfa063a97f1214b82274c00f8"
 
 [[package]]
 name = "datafusion-execution"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9609d83d52ff8315283c6dad3b97566e877d8f366fab4c3297742f33dcd636c7"
+checksum = "042ea192757d1b2d7dcf71643e7ff33f6542c7704f00228d8b85b40003fd8e0f"
 dependencies = [
  "arrow",
  "dashmap",
@@ -1039,11 +1050,12 @@ dependencies = [
 
 [[package]]
 name = "datafusion-expr"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e75230cd67f650ef0399eb00f54d4a073698f2c0262948298e5299fc7324da63"
+checksum = "025222545d6d7fab71e2ae2b356526a1df67a2872222cbae7535e557a42abd2e"
 dependencies = [
  "arrow",
+ "async-trait",
  "chrono",
  "datafusion-common",
  "datafusion-doc",
@@ -1060,9 +1072,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-expr-common"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "70fafb3a045ed6c49cfca0cd090f62cf871ca6326cc3355cb0aaf1260fa760b6"
+checksum = "9d5c267104849d5fa6d81cf5ba88f35ecd58727729c5eb84066c25227b644ae2"
 dependencies = [
  "arrow",
  "datafusion-common",
@@ -1073,9 +1085,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-functions"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "cdf9a9cf655265861a20453b1e58357147eab59bdc90ce7f2f68f1f35104d3bb"
+checksum = "c620d105aa208fcee45c588765483314eb415f5571cfd6c1bae3a59c5b4d15bb"
 dependencies = [
  "arrow",
  "arrow-buffer",
@@ -1102,9 +1114,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-functions-aggregate"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7f07e49733d847be0a05235e17b884d326a2fd402c97a89fe8bcf0bfba310005"
+checksum = "35f61d5198a35ed368bf3aacac74f0d0fa33de7a7cb0c57e9f68ab1346d2f952"
 dependencies = [
  "ahash",
  "arrow",
@@ -1123,9 +1135,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-functions-aggregate-common"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4512607e10d72b0b0a1dc08f42cb5bd5284cb8348b7fea49dc83409493e32b1b"
+checksum = "13efdb17362be39b5024f6da0d977ffe49c0212929ec36eec550e07e2bc7812f"
 dependencies = [
  "ahash",
  "arrow",
@@ -1136,9 +1148,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-functions-nested"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2ab331806e34f5545e5f03396e4d5068077395b1665795d8f88c14ec4f1e0b7a"
+checksum = "9187678af567d7c9e004b72a0b6dc5b0a00ebf4901cb3511ed2db4effe092e66"
 dependencies = [
  "arrow",
  "arrow-ord",
@@ -1148,6 +1160,7 @@ dependencies = [
  "datafusion-expr",
  "datafusion-functions",
  "datafusion-functions-aggregate",
+ "datafusion-functions-aggregate-common",
  "datafusion-macros",
  "datafusion-physical-expr-common",
  "itertools 0.14.0",
@@ -1157,9 +1170,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-functions-table"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d4ac2c0be983a06950ef077e34e0174aa0cb9e346f3aeae459823158037ade37"
+checksum = "ecf156589cc21ef59fe39c7a9a841b4a97394549643bbfa88cc44e8588cf8fe5"
 dependencies = [
  "arrow",
  "async-trait",
@@ -1173,9 +1186,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-functions-window"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "36f3d92731de384c90906941d36dcadf6a86d4128409a9c5cd916662baed5f53"
+checksum = "edcb25e3e369f1366ec9a261456e45b5aad6ea1c0c8b4ce546587207c501ed9e"
 dependencies = [
  "arrow",
  "datafusion-common",
@@ -1191,9 +1204,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-functions-window-common"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c679f8bf0971704ec8fd4249fcbb2eb49d6a12cc3e7a840ac047b4928d3541b5"
+checksum = "8996a8e11174d0bd7c62dc2f316485affc6ae5ffd5b8a68b508137ace2310294"
 dependencies = [
  "datafusion-common",
  "datafusion-physical-expr-common",
@@ -1201,9 +1214,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-macros"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2821de7cb0362d12e75a5196b636a59ea3584ec1e1cc7dc6f5e34b9e8389d251"
+checksum = "95ee8d1be549eb7316f437035f2cec7ec42aba8374096d807c4de006a3b5d78a"
 dependencies = [
  "datafusion-expr",
  "quote",
@@ -1212,14 +1225,15 @@ dependencies = [
 
 [[package]]
 name = "datafusion-optimizer"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1594c7a97219ede334f25347ad8d57056621e7f4f35a0693c8da876e10dd6a53"
+checksum = "c9fa98671458254928af854e5f6c915e66b860a8bde505baea0ff2892deab74d"
 dependencies = [
  "arrow",
  "chrono",
  "datafusion-common",
  "datafusion-expr",
+ "datafusion-expr-common",
  "datafusion-physical-expr",
  "indexmap",
  "itertools 0.14.0",
@@ -1231,9 +1245,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-physical-expr"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dc6da0f2412088d23f6b01929dedd687b5aee63b19b674eb73d00c3eb3c883b7"
+checksum = "3515d51531cca5f7b5a6f3ea22742b71bb36fc378b465df124ff9a2fa349b002"
 dependencies = [
  "ahash",
  "arrow",
@@ -1253,9 +1267,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-physical-expr-common"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dcb0dbd9213078a593c3fe28783beaa625a4e6c6a6c797856ee2ba234311fb96"
+checksum = "24485475d9c618a1d33b2a3dad003d946dc7a7bbf0354d125301abc0a5a79e3e"
 dependencies = [
  "ahash",
  "arrow",
@@ -1267,9 +1281,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-physical-optimizer"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6d140854b2db3ef8ac611caad12bfb2e1e1de827077429322a6188f18fc0026a"
+checksum = "b9da411a0a64702f941a12af2b979434d14ec5d36c6f49296966b2c7639cbb3a"
 dependencies = [
  "arrow",
  "datafusion-common",
@@ -1279,6 +1293,7 @@ dependencies = [
  "datafusion-physical-expr",
  "datafusion-physical-expr-common",
  "datafusion-physical-plan",
+ "datafusion-pruning",
  "itertools 0.14.0",
  "log",
  "recursive",
@@ -1286,9 +1301,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-physical-plan"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b46cbdf21a01206be76d467f325273b22c559c744a012ead5018dfe79597de08"
+checksum = "a6d168282bb7b54880bb3159f89b51c047db4287f5014d60c3ef4c6e1468212b"
 dependencies = [
  "ahash",
  "arrow",
@@ -1314,11 +1329,29 @@ dependencies = [
  "tokio",
 ]
 
+[[package]]
+name = "datafusion-pruning"
+version = "49.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "391a457b9d23744c53eeb89edd1027424cba100581488d89800ed841182df905"
+dependencies = [
+ "arrow",
+ "arrow-schema",
+ "datafusion-common",
+ "datafusion-datasource",
+ "datafusion-expr-common",
+ "datafusion-physical-expr",
+ "datafusion-physical-expr-common",
+ "datafusion-physical-plan",
+ "itertools 0.14.0",
+ "log",
+]
+
 [[package]]
 name = "datafusion-session"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3a72733766ddb5b41534910926e8da5836622316f6283307fd9fb7e19811a59c"
+checksum = "053201c2bb729c7938f85879034df2b5a52cfaba16f1b3b66ab8505c81b2aad3"
 dependencies = [
  "arrow",
  "async-trait",
@@ -1340,9 +1373,9 @@ dependencies = [
 
 [[package]]
 name = "datafusion-sql"
-version = "48.0.1"
+version = "49.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c5162338cdec9cc7ea13a0e6015c361acad5ec1d88d83f7c86301f789473971f"
+checksum = "9082779be8ce4882189b229c0cff4393bd0808282a7194130c9f32159f185e25"
 dependencies = [
  "arrow",
  "bigdecimal",
@@ -1559,8 +1592,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592"
 dependencies = [
  "cfg-if",
+ "js-sys",
  "libc",
  "wasi 0.11.1+wasi-snapshot-preview1",
+ "wasm-bindgen",
 ]
 
 [[package]]
@@ -1628,12 +1663,6 @@ dependencies = [
  "foldhash",
 ]
 
-[[package]]
-name = "hermit-abi"
-version = "0.5.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c"
-
 [[package]]
 name = "hex"
 version = "0.4.3"
@@ -1815,22 +1844,11 @@ dependencies = [
  "libc",
 ]
 
-[[package]]
-name = "is-terminal"
-version = "0.4.16"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9"
-dependencies = [
- "hermit-abi",
- "libc",
- "windows-sys 0.59.0",
-]
-
 [[package]]
 name = "itertools"
-version = "0.10.5"
+version = "0.13.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
+checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186"
 dependencies = [
  "either",
 ]
@@ -1934,6 +1952,12 @@ dependencies = [
  "static_assertions",
 ]
 
+[[package]]
+name = "libbz2-rs-sys"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7"
+
 [[package]]
 name = "libc"
 version = "0.2.174"
@@ -2217,6 +2241,7 @@ dependencies = [
  "num-bigint",
  "object_store",
  "paste",
+ "ring",
  "seq-macro",
  "simdutf8",
  "snap",
@@ -2472,6 +2497,20 @@ version = "0.8.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
 
+[[package]]
+name = "ring"
+version = "0.17.14"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7"
+dependencies = [
+ "cc",
+ "cfg-if",
+ "getrandom 0.2.16",
+ "libc",
+ "untrusted",
+ "windows-sys 0.52.0",
+]
+
 [[package]]
 name = "rustc-demangle"
 version = "0.1.25"
@@ -2890,6 +2929,12 @@ version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4a1a07cc7db3810833284e8d372ccdc6da29741639ecc70c9ec107df0fa6154c"
 
+[[package]]
+name = "untrusted"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
+
 [[package]]
 name = "url"
 version = "2.5.4"
diff --git a/Cargo.toml b/Cargo.toml
index 5f64ef8..c4125ad 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,18 +9,17 @@ exclude = [
 ]
 
 [dependencies]
-datafusion = "48.0.1"
+datafusion = "49.0.2"
 tokio = {version = "1"}
 
 [dev-dependencies]
-criterion = { version = "0.5", features = ["html_reports", "async_tokio"] }
+criterion = { version = "0.7", features = ["html_reports", "async_tokio"] }
 tokio = { version = "1", features = ["full"] }
 
 [[bench]]
 name = "pagerank_benchmark"
 harness = false # To disable Rust's default benchmarking and use the Criterion one
 
-# Adding more benchmarks
-# [[bench]]
-# name = "shortestdistance_benchmark"
-# harness = false
\ No newline at end of file
+[[bench]]
+name = "cc_benchmark"
+harness = false
\ No newline at end of file
diff --git a/benches/README.md b/benches/README.md
index 0f5acb4..076b7ba 100644
--- a/benches/README.md
+++ b/benches/README.md
@@ -1,9 +1,9 @@
-# Running Benchmarks for Graphframe-rs
+# Running Benchmarks for graphframes-rs
 
-Benchmarking for Graphframe-rs are currently done on LDBC Graphalytics [datasets](https://ldbcouncil.org/benchmarks/graphalytics/datasets/).
+Benchmarking for graphframes-rs is currently done on LDBC Graphalytics [datasets](https://ldbcouncil.org/benchmarks/graphalytics/datasets/).
 Benchmarking runs and reports are executed/generated as html-reports using Rust Criterion crate.
 
-## How to run benchmarks ?
+## How to run benchmarks?
 
 `run_benchmarks.py` file is the main source for running the benchmarks.
 
@@ -24,9 +24,9 @@ CLI utility:
 
 ### Parameters for `run_benchmarks.py`
 
-- `--dataset`: [MANDATORY] LDBC dataset name on which user want to run the benchmark (for e.g. test-pr-directed, cit-Patents). Dataset name are exactly same as mentioned in LDBC website.
+- `--dataset`: LDBC dataset name on which the user wants to run the benchmark (e.g., test-pr-directed, cit-Patents). The dataset name must be exactly the same as on the LDBC website. Default: `wiki-Talk`.
 - `--checkpoint_interval`: If user wants to define a specific number of checkpoints for Algorithms to run on. `default: 1`
-- `--name`: If a particular benchmark needs to run. Name should be same as the `[[bench]]` names present in `Cargo.toml`
+- `--name`: [MANDATORY] Name of the benchmark to run. It must match one of the `[[bench]]` names present in `Cargo.toml`.
 
 ```bash
 # Running all the benchmarks
diff --git a/benches/cc_benchmark.rs b/benches/cc_benchmark.rs
new file mode 100644
index 0000000..8dc9b9e
--- /dev/null
+++ b/benches/cc_benchmark.rs
@@ -0,0 +1,42 @@
+use criterion::{Criterion, criterion_group, criterion_main};
+use graphframes_rs::util::create_ldbc_test_graph;
+use std::env;
+use tokio::runtime::Runtime;
+
+/// Benchmarks connected components on the LDBC dataset named by `BENCHMARK_DATASET`;
+/// panics when `BENCHMARK_DATASET` or `WEIGHTED` is unset.
+fn benchmark_cc(c: &mut Criterion) {
+    let dataset_name =
+        env::var("BENCHMARK_DATASET").expect("BENCHMARK_DATASET environment variable not set");
+    // Only the exact string "true" selects the weighted graph; any other value means unweighted.
+    let is_weighted =
+        env::var("WEIGHTED").expect("WEIGHTED environment variable not set") == "true";
+
+    let mut group = c.benchmark_group("Connected Components");
+    group.sample_size(10);
+    group.measurement_time(std::time::Duration::from_secs(200));
+
+    // Create a Tokio runtime to drive the async graph loading and the async iterations.
+    let rt = Runtime::new().unwrap();
+
+    // Load the graph data once, before the measured closure runs.
+    let graph = rt
+        .block_on(create_ldbc_test_graph(&dataset_name, true, is_weighted))
+        .expect("Failed to create test graph");
+
+    // Build once and clone per iteration so builder construction is not part of the timing.
+    let cc_builder = graph.connected_components();
+
+    group.bench_function(format!("cc-{dataset_name}"), |b| {
+        b.to_async(&rt).iter(|| async {
+            // Panic on failure so a failed query is never timed as a successful run.
+            let output = cc_builder.clone().run().await.unwrap();
+            output.data.collect().await.expect("Failed to collect connected components");
+        })
+    });
+
+    group.finish();
+}
+
+criterion_group!(benches, benchmark_cc);
+criterion_main!(benches);
diff --git a/benches/pagerank_benchmark.rs b/benches/pagerank_benchmark.rs
index d32aa35..b3f6faa 100644
--- a/benches/pagerank_benchmark.rs
+++ b/benches/pagerank_benchmark.rs
@@ -1,4 +1,4 @@
-use criterion::{Criterion, criterion_group, criterion_main};
+use criterion::{criterion_group, criterion_main, Criterion};
 use graphframes_rs::util::create_ldbc_test_graph;
 use std::env;
 use tokio::runtime::Runtime;
@@ -12,14 +12,21 @@ fn benchmark_pagerank(c: &mut Criterion) {
         .parse()
         .expect("CHECKPOINT_INTERVAL is not a valid int");
 
+    let is_weighted = match env::var("WEIGHTED").expect("WEIGHTED environment variable not set") {
+        s if s == "true" => true,
+        _ => false,
+    };
+
     let mut group = c.benchmark_group("PageRank");
+    group.sample_size(10);
+    group.measurement_time(std::time::Duration::from_secs(200));
 
     // Create a Tokio runtime to execute the async graph loading function.
     let rt = Runtime::new().unwrap();
 
     // Load the graph data once before running the benchmark.
     let graph = rt
-        .block_on(create_ldbc_test_graph(&dataset_name, true, false))
+        .block_on(create_ldbc_test_graph(&dataset_name, true, is_weighted))
         .expect("Failed to create test graph");
 
     // Creating pagerank_builder here so to exclude the time of generation in each iteration
diff --git a/run_benchmarks.py b/run_benchmarks.py
index 91405c2..6fb11a4 100644
--- a/run_benchmarks.py
+++ b/run_benchmarks.py
@@ -1,12 +1,13 @@
 import argparse
 import os
 import pathlib
-import requests
+import shutil
 import subprocess
 import sys
-import shutil
 import time
 
+import requests
+
 # The base URL for downloading Graphalytics datasets.
 BASE_URL = "https://datasets.ldbcouncil.org/graphalytics"
 
@@ -27,7 +28,7 @@ def prepare_dataset(dataset_name: str):
         print(f"Dataset '{dataset_name}' is ready.")
         return
 
-    # make dataset_dir if doesn't exist
+    # make dataset_dir if it doesn't exist
     os.mkdir(dataset_dir)
 
     # If the archive doesn't exist, download it.
@@ -97,7 +98,7 @@ def prepare_dataset(dataset_name: str):
         for dirpath, _, filenames in os.walk(dataset_dir):
             for filename in filenames:
                 if (not filename.endswith(".properties")) and (
-                    not filename.endswith(".tar.zst")
+                        not filename.endswith(".tar.zst")
                 ):
                     old_path = pathlib.Path(dirpath) / filename
                     new_path = old_path.with_name(f"{old_path.name}.csv")
@@ -118,7 +119,7 @@ def prepare_dataset(dataset_name: str):
         sys.exit(1)
 
 
-def run_benchmarks(dataset_name: str, checkpoint_interval: int, benchmark_name: str):
+def run_benchmarks(dataset_name: str, checkpoint_interval: int, benchmark_name: str, is_weighted: str = "false"):
     """
     Runs the Rust benchmarks using 'cargo bench', passing the dataset name
     as an environment variable.
@@ -128,7 +129,8 @@ def run_benchmarks(dataset_name: str, checkpoint_interval: int, benchmark_name:
     # Set the dataset name in an environment variable for the benchmark process.
     env = os.environ.copy()
     env["BENCHMARK_DATASET"] = dataset_name
-    env["CHECKPOINT_INTERVAL"] = checkpoint_interval
+    env["CHECKPOINT_INTERVAL"] = str(checkpoint_interval)
+    env["WEIGHTED"] = is_weighted
 
     # Execute 'cargo bench' and stream its output.
     try:
@@ -181,7 +183,8 @@ def main():
     parser.add_argument(
         "--dataset",
         type=str,
-        required=True,
+        default="wiki-Talk",
+        required=False,
         help="The name of the Graphalytics dataset to download and use for benchmarking (e.g., 'test-pr-directed').",
     )
     parser.add_argument(
@@ -194,9 +197,16 @@ def main():
     parser.add_argument(
         "--name",
         type=str,
-        required=False,
+        required=True,
         help="Name of the benchmark that needs to run.",
     )
+    parser.add_argument(
+        "--weighted",
+        type=str,
+        required=False,
+        default="false",
+        help="Whether the graph is weighted or not.",
+    )
     args = parser.parse_args()
     dataset = args.dataset
     checkpoint_interval = args.checkpoint_interval
@@ -206,7 +216,7 @@ def main():
     BENCH_DATA_DIR.mkdir(parents=True, exist_ok=True)
 
     prepare_dataset(dataset)
-    run_benchmarks(dataset, checkpoint_interval, benchmark_name)
+    run_benchmarks(dataset, checkpoint_interval, benchmark_name, is_weighted=args.weighted)
 
 
 if __name__ == "__main__":
diff --git a/src/connected_components.rs b/src/connected_components.rs
index 27077e6..cf7b607 100644
--- a/src/connected_components.rs
+++ b/src/connected_components.rs
@@ -63,14 +63,14 @@ async fn min_nbr_sum(min_neighbours: &DataFrame) -> Result<i128> {
         .map(|a| a.value(0))
 }
 
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub struct ConnectedComponentsOutput {
     pub data: DataFrame,
     pub num_iterations: usize,
     pub min_nbr_sum: Vec<i128>,
 }
 
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub struct ConnectedComponentsBuilder<'a> {
     graph_frame: &'a GraphFrame,
 }
diff --git a/src/util.rs b/src/util.rs
index a58a0c3..1e426dc 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -79,6 +79,8 @@ pub async fn create_ldbc_test_graph(
                 .schema(&vertices_schema),
         )
         .await?;
+    println!("read {} vertices", vertices.clone().count().await?);
+    println!("read {} edges", edges.clone().count().await?);
     Ok(GraphFrame { vertices, edges })
 }
 

From 4eab64d6eb89fb25b39e266858adedb280e88995 Mon Sep 17 00:00:00 2001
From: semyonsinchenko <ssinchenko@apache.org>
Date: Mon, 8 Sep 2025 18:40:10 +0200
Subject: [PATCH 3/4] Add shortest paths benchmark

- Introduced `sp_benchmark.rs` for benchmarking shortest path algorithms.
- Added `Clone` and `Debug` traits to `ShortestPathsBuilder`.
- Updated `Cargo.toml` to include the new benchmark configuration.
---
 Cargo.toml                    |  4 +++
 benches/pagerank_benchmark.rs |  2 +-
 benches/sp_benchmark.rs       | 47 +++++++++++++++++++++++++++++++++++
 src/shortest_paths.rs         |  1 +
 4 files changed, 53 insertions(+), 1 deletion(-)
 create mode 100644 benches/sp_benchmark.rs

diff --git a/Cargo.toml b/Cargo.toml
index c4125ad..7e8422a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,4 +22,8 @@ harness = false # To disable Rust's default benchmarking and use the Criterion o
 
 [[bench]]
 name = "cc_benchmark"
+harness = false
+
+[[bench]]
+name = "sp_benchmark"
 harness = false
\ No newline at end of file
diff --git a/benches/pagerank_benchmark.rs b/benches/pagerank_benchmark.rs
index b3f6faa..e7d5491 100644
--- a/benches/pagerank_benchmark.rs
+++ b/benches/pagerank_benchmark.rs
@@ -1,4 +1,4 @@
-use criterion::{criterion_group, criterion_main, Criterion};
+use criterion::{Criterion, criterion_group, criterion_main};
 use graphframes_rs::util::create_ldbc_test_graph;
 use std::env;
 use tokio::runtime::Runtime;
diff --git a/benches/sp_benchmark.rs b/benches/sp_benchmark.rs
new file mode 100644
index 0000000..0d99e65
--- /dev/null
+++ b/benches/sp_benchmark.rs
@@ -0,0 +1,47 @@
+use criterion::{Criterion, criterion_group, criterion_main};
+use graphframes_rs::util::create_ldbc_test_graph;
+use std::env;
+use tokio::runtime::Runtime;
+
+/// Benchmarks shortest paths to landmark vertex 2 on the dataset named by `BENCHMARK_DATASET`.
+fn benchmark_sp(c: &mut Criterion) {
+    let dataset_name =
+        env::var("BENCHMARK_DATASET").expect("BENCHMARK_DATASET environment variable not set");
+    let checkpoint_interval: usize = env::var("CHECKPOINT_INTERVAL")
+        .expect("CHECKPOINT_INTERVAL environment variable not set")
+        .parse()
+        .expect("CHECKPOINT_INTERVAL is not a valid int");
+
+    let is_weighted =
+        env::var("WEIGHTED").expect("WEIGHTED environment variable not set") == "true";
+
+    let mut group = c.benchmark_group("ShortestPath");
+    group.sample_size(10);
+    group.measurement_time(std::time::Duration::from_secs(200));
+
+    let rt = Runtime::new().unwrap();
+    let graph = rt
+        .block_on(create_ldbc_test_graph(&dataset_name, true, is_weighted))
+        .expect("Failed to create test graph");
+
+    let sp_builder = graph
+        .shortest_paths(vec![2i64]) // TODO: replace to read from props
+        .checkpoint_interval(checkpoint_interval);
+
+    group.bench_function(
+        String::from(
+            "sp-".to_owned() + &dataset_name + "-cp-" + &checkpoint_interval.to_string(),
+        ),
+        |b| {
+            // Use the `to_async` adapter to benchmark an async function.
+            b.to_async(&rt).iter(|| async {
+                // Panic on failure so a failed query is never timed as a successful run.
+                sp_builder.clone().run().await.unwrap().collect().await.unwrap();
+            })
+        },
+    );
+    group.finish();
+}
+
+criterion_group!(benches, benchmark_sp);
+criterion_main!(benches);
diff --git a/src/shortest_paths.rs b/src/shortest_paths.rs
index e390b45..2e399bd 100644
--- a/src/shortest_paths.rs
+++ b/src/shortest_paths.rs
@@ -105,6 +105,7 @@ impl Accumulator for DistancesMap {
 ///
 /// This builder helps configure and execute a Pregel algorithm that computes the shortest paths
 /// from all vertices in the graph to a specified set of landmark vertices.
+#[derive(Debug, Clone)]
 pub struct ShortestPathsBuilder<'a> {
     /// Reference to the graph frame containing vertices and edges
     graph_frame: &'a GraphFrame,

From fbf558e292735d7778a8a6a366eacc34b7b1d6c2 Mon Sep 17 00:00:00 2001
From: semyonsinchenko <ssinchenko@apache.org>
Date: Mon, 8 Sep 2025 18:41:24 +0200
Subject: [PATCH 4/4] Refactor shortest paths benchmark string construction

---
 benches/sp_benchmark.rs | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/benches/sp_benchmark.rs b/benches/sp_benchmark.rs
index 0d99e65..535f75d 100644
--- a/benches/sp_benchmark.rs
+++ b/benches/sp_benchmark.rs
@@ -29,9 +29,7 @@ fn benchmark_sp(c: &mut Criterion) {
         .checkpoint_interval(checkpoint_interval);
 
     group.bench_function(
-        String::from(
-            "sp-".to_owned() + &dataset_name + "-cp-" + &checkpoint_interval.to_string(),
-        ),
+        String::from("sp-".to_owned() + &dataset_name + "-cp-" + &checkpoint_interval.to_string()),
         |b| {
             // Use the `to_async` adapter to benchmark an async function.
             b.to_async(&rt).iter(|| async {