PopovIILab · iliapopov17 · Jul 17, 2025 · Jun 9, 2025 · Jun 23, 2025 · Jun 23, 2025
diff --git a/README.md b/README.md
@@ -38,11 +38,10 @@ X9,0,3,2,16,7,1,23,12,10,9,1,2,134,40,390,289,29,372,27,81,150,90,9,88,32,287,88
 
 ### Relative abundance output
 
-`counts_phylum.csv` parsed from 7 kraken2 reports of metagenomic samples using `KrakenParser`:
+`ra_phylum.csv` calculated from 7 kraken2 reports of metagenomic samples using `KrakenParser`:
 
 ```
 Sample_id,taxon,rel_abund_perc
-Sample_id,taxon,rel_abund_perc
 X1,Pseudomonadota,85.03558294577552
 X1,Bacillota,10.72121619814011
 X1,Other (<4.0%),4.243200856084384
@@ -59,6 +58,43 @@ X9,Bacillota,12.473649123439218
 X9,Other (<4.0%),1.8979510606688494
 ```
 
+### α-diversity output
+
+`alpha_div.csv` calculated from 7 kraken2 reports of metagenomic samples using `KrakenParser`:
+
+```
+Sample,Shannon,Pielou,Chao1
+X1,3.911345447107001,0.5269245043289149,2274.533185840708
+X2,3.9944130792536563,0.4906424221265042,4155.0
+...
+X8,3.442077115880119,0.42753293021330063,4177.251358695652
+X9,4.033664950188261,0.5050385978575492,3492.16
+```
+
+### β-diversity output
+
+`beta_div_bray.csv` calculated from 7 kraken2 reports of metagenomic samples using `KrakenParser`:
+
+```
+,X1,X2,...,X8,X9
+X1,0.0,0.398,...,0.61,0.353
+X2,0.398,0.0,...,0.723,0.388
+...
+X8,0.61,0.723,...,0.0,0.665
+X9,0.353,0.388,...,0.665,0.0
+```
+
+`beta_div_jaccard.csv` calculated from 7 kraken2 reports of metagenomic samples using `KrakenParser`:
+
+```
+,X1,X2,...,X8,X9
+X1,0.0,0.7073170731707317,...,0.8223938223938224,0.7232472324723247
+X2,0.7073170731707317,0.0,...,0.835016835016835,0.7352941176470589
+...
+X8,0.8223938223938224,0.835016835016835,...,0.0,0.8066914498141264
+X9,0.7232472324723247,0.7352941176470589,...,0.8066914498141264,0.0
+```
+
 ### Visualization examples gallery
 
 |[Stacked Barplot](https://github.com/PopovIILab/KrakenParser/wiki/Stacked-Barplot-API)|[Streamgraph](https://github.com/PopovIILab/KrakenParser/wiki/Streamgraph-API)|
@@ -82,6 +118,7 @@ This will:
 4. Process extracted text files
 5. Convert them into CSV format
 6. Calculate relative abundance
+7. Calculate α & β-diversities
 
 ### **Input Requirements**
 - The Kraken2 reports must be inside a **subdirectory** (e.g., `data/kreports`).
@@ -160,6 +197,22 @@ KrakenParser --relabund -i data/counts/csv/counts_phylum.csv -o data/counts/csv_
 
 This will group all the taxa that have abundance <3.5 into "Other <3.5%" group. Other parameters are welcome!
 
+### **Step 7: Calculate α & β-diversities**
+```bash
+KrakenParser --diversity -i data/counts/csv/counts_species.csv -o data/diversity
+#Having troubles? Run KrakenParser --diversity -h
+```
+
+This calculates α & β-diversities and saves them as CSV format to directory provided in the output.
+
+If user wants to use another depth for β-diversity calculations:
+```bash
+KrakenParser --diversity -i data/counts/csv/counts_species.csv -o data/diversity --depth 750
+#Having troubles? Run KrakenParser --diversity -h
+```
+
+Other parameters are welcome!
+
 ## Arguments Breakdown
 ### **KrakenParser** (Main Pipeline)
 - Automates the entire workflow.
@@ -190,29 +243,39 @@ This will group all the taxa that have abundance <3.5 into "Other <3.5%" group.
 - Calculates relative abundance based on total abundance CSV.
 - Optionally can group low abundant taxa.
 
+### **--diversity** (Step 7)
+- Calculates α & β-diversities based on total species abundance CSV.
+- Shannon, Pielou & Chao1 indices for α-diversity
+- Bray-Curtis & Jaccard indices for β-diversity
+- Uses 1000 depth for β-diversity as default (can be adjusted with -d)
+
 ## Example Output Structure
 After running the full pipeline, the output directory will look like this:
 ```
 data/
-├─ kreports/           # Input Kraken2 reports
-├─ mpa/                # Converted MPA files
-├─ COMBINED.txt        # Merged MPA file
-└─ counts/
-   ├─ txt/             # Extracted taxonomic levels in TXT
-   │  ├─ counts_species.txt
-   │  ├─ counts_genus.txt
-   │  ├─ counts_family.txt
-   │  ├─ ...
-   └─ csv/             # Total abundance CSV output
-   │  ├─ counts_species.csv
-   │  ├─ counts_genus.csv
-   │  ├─ counts_family.csv
-   │  ├─ ...
-   └─ csv_relabund/    # Relative abundance CSV output
-      ├─ counts_species.csv
-      ├─ counts_genus.csv
-      ├─ counts_family.csv
-      ├─ ...
+├─ kreports/               # Input Kraken2 reports
+├─ mpa/                    # Converted MPA files
+├─ COMBINED.txt            # Merged MPA file
+├─ counts/
+│  ├─ txt/                 # Extracted taxonomic levels in TXT
+│  │  ├─ counts_species.txt
+│  │  ├─ counts_genus.txt
+│  │  ├─ counts_family.txt
+│  │  ├─ ...
+│  └─ csv/                 # Total abundance CSV output
+│  │  ├─ counts_species.csv
+│  │  ├─ counts_genus.csv
+│  │  ├─ counts_family.csv
+│  │  ├─ ...
+├─ rel_abund/              # Relative abundance CSV output
+│  ├─ ra_species.csv
+│  ├─ ra_genus.csv
+│  ├─ ra_family.csv
+│  ├─ ...
+└─ diversity/
+   ├─ alpha_div.csv
+   ├─ beta_div_bray.csv
+   └─ beta_div_jaccard.csv
 ```
 
 ## Conclusion

diff --git a/krakenparser/decombine.sh b/krakenparser/decombine.sh
@@ -55,7 +55,6 @@ fi
 # Create destination directories
 mkdir -p "${DESTINATION_DIR}/txt"
 mkdir -p "${DESTINATION_DIR}/csv"
-mkdir -p "${DESTINATION_DIR}/csv_relabund"
 
 # Process input file and generate output files
 grep -E "s__" "${SOURCE_FILE}" \

diff --git a/krakenparser/diversity.py b/krakenparser/diversity.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python
+
+import pandas as pd
+import numpy as np
+import sys
+import shutil
+import argparse
+from pathlib import Path
+from skbio.diversity import beta_diversity
+from skbio.stats import subsample_counts
+
+
+# Define Shannon index
+def shannon_index(counts):
+    counts = np.array(counts)
+    counts = counts[counts > 0]
+    proportions = counts / counts.sum()
+    return -np.sum(proportions * np.log(proportions))
+
+
+# Define Pielou's evenness
+def pielou_evenness(counts):
+    counts = np.array(counts)
+    counts = counts[counts > 0]
+    H = shannon_index(counts)
+    S = len(counts)
+    return H / np.log(S) if S > 1 else 0
+
+
+# Define Chao1 richness estimator
+def chao1_index(counts):
+    counts = np.array(counts)
+    S_obs = np.sum(counts > 0)
+    F1 = np.sum(counts == 1)
+    F2 = np.sum(counts == 2)
+    if F2 == 0:
+        return S_obs + F1 * (F1 - 1) / 2
+    return S_obs + (F1 * F1) / (2 * F2)
+
+
+def calc_alpha_div(df, output_path):
+    results = []
+    for sample_id, row in df.iterrows():
+        counts = row.values
+        results.append(
+            {
+                "Sample": sample_id,
+                "Shannon": shannon_index(counts),
+                "Pielou": pielou_evenness(counts),
+                "Chao1": chao1_index(counts),
+            }
+        )
+    alpha_df = pd.DataFrame(results).set_index("Sample")
+    alpha_df.to_csv(output_path / "alpha_div.csv")
+
+
+def calc_beta_div(df, output_path, rarefaction_depth):
+    rarefied_counts = []
+    sample_ids = []
+
+    for sample, row in df.iterrows():
+        counts = row.values.astype(int)
+        if counts.sum() >= rarefaction_depth:
+            rarefied = subsample_counts(counts, n=rarefaction_depth)
+            rarefied_counts.append(rarefied)
+            sample_ids.append(sample)
+
+    if len(rarefied_counts) < 2:
+        raise ValueError("Not enough samples passed the rarefaction threshold.")
+
+    bray_df = beta_diversity(
+        "braycurtis", rarefied_counts, ids=sample_ids
+    ).to_data_frame()
+    jaccard_df = beta_diversity(
+        "jaccard", rarefied_counts, ids=sample_ids
+    ).to_data_frame()
+
+    bray_df.to_csv(output_path / "beta_div_bray.csv")
+    jaccard_df.to_csv(output_path / "beta_div_jaccard.csv")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Calculate α & β-diversities.")
+    parser.add_argument(
+        "-i",
+        "--input",
+        required=True,
+        help="Input total count table CSV (species level).",
+    )
+    parser.add_argument("-o", "--output", required=True, help="Output directory path.")
+    parser.add_argument(
+        "-d",
+        "--depth",
+        type=int,
+        default=1000,
+        help="Rarefaction depth for β diversity (default: 1000).",
+    )
+    args = parser.parse_args()
+
+    input_file = Path(args.input)
+    output_dir = Path(args.output)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    df = pd.read_csv(input_file, index_col=0)
+
+    calc_alpha_div(df, output_dir)
+    calc_beta_div(df, output_dir, args.depth)
+    print(
+        f"α & β-diversities have been successfully calculated and saved to '{output_dir}'."
+    )
+
+    pycache_dir = Path(__file__).resolve().parent / "__pycache__"
+    if pycache_dir.exists() and pycache_dir.is_dir():
+        shutil.rmtree(pycache_dir)
diff --git a/krakenparser/kraken2csv.sh b/krakenparser/kraken2csv.sh
@@ -62,7 +62,6 @@ for file in "$COUNTS_DIR"/txt/counts_*.txt; do
         echo "Error: Failed to process $file"
         exit 1
     fi
-    echo "Processed $file successfully."
 done
 
 # PART 5: CONVERT TXT FILES TO CSV
@@ -74,9 +73,18 @@ done
 
 # PART 6: CALCULATE RELATIVE ABUNDANCE
 
+mkdir -p "$PARENT_DIR/rel_abund"
+
 for file in "$COUNTS_DIR"/csv/counts_*.csv; do
-    CSV_RA_FILE="$COUNTS_DIR/csv_relabund/$(basename "$file")"
+    base=$(basename "$file")
+    new_name="${base/counts_/ra_}"
+    CSV_RA_FILE="$PARENT_DIR/rel_abund/$new_name"
     python "$SCRIPT_DIR/relabund.py" -i "$file" -o "$CSV_RA_FILE"
 done
 
+# PART 7: CALCULATE α & β-DIVERSITIES
+
+CSV_SPECIES_FILE="$COUNTS_DIR/csv/counts_species.csv"
+python "$SCRIPT_DIR/diversity.py" -i "$CSV_SPECIES_FILE" -o "$PARENT_DIR/diversity"
+
 echo "All steps completed successfully!"
diff --git a/krakenparser/krakenparser.py b/krakenparser/krakenparser.py
@@ -54,6 +54,11 @@ def main():
         action="store_true",
         help="Calculate relative abundance",
     )
+    parser.add_argument(
+        "--diversity",
+        action="store_true",
+        help="Calculate α & β-diversities",
+    )
     parser.add_argument(
         "-V", "--version", action="version", version=f"%(prog)s {__version__}"
     )
@@ -72,6 +77,7 @@ def main():
         "process": package_dir / "processing_script.py",
         "txt2csv": package_dir / "convert2csv.py",
         "relabund": package_dir / "relabund.py",
+        "diversity": package_dir / "diversity.py",
     }
 
     if "-h" in sys.argv or "--help" in sys.argv:

diff --git a/krakenparser/processing_script.py b/krakenparser/processing_script.py
@@ -35,6 +35,8 @@ def process_files(source_file, destination_file):
     with open(destination_file, "w") as file:
         file.write(updated_content)
 
+    print(f"Processed {destination_file} successfully.")
+
     # Get the path to the current directory (same location as the script)
     current_dir = Path(__file__).resolve().parent
     pycache_dir = current_dir / "__pycache__"

diff --git a/krakenparser/version.py b/krakenparser/version.py
@@ -1 +1 @@
-__version__ = "0.5.0"
+__version__ = "0.6.0"
diff --git a/tests/test_full_pipeline.py b/tests/test_full_pipeline.py
@@ -42,8 +42,8 @@ def test_full_pipeline_end_to_end(demo_run):
         assert csv_path.stat().st_size > 0, f"counts_{rank}.csv is empty"
 
     # 3. Assert relative‐abundance CSVs exist and are non‐empty
-    rel_dir = run_dir / "counts" / "csv_relabund"
-    assert rel_dir.exists(), "csv_relabund directory is missing"
-    rel_species = rel_dir / "counts_species.csv"
-    assert rel_species.exists(), "Missing counts_species.csv in csv_relabund"
-    assert rel_species.stat().st_size > 0, "counts_species.csv is empty"
+    rel_dir = run_dir / "rel_abund"
+    assert rel_dir.exists(), "rel_abund directory is missing"
+    rel_species = rel_dir / "ra_species.csv"
+    assert rel_species.exists(), "Missing ra_species.csv in csv_relabund"
+    assert rel_species.stat().st_size > 0, "ra_species.csv is empty"