Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion gfalibs
171 changes: 171 additions & 0 deletions include/bam-utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
#ifndef BAMUTILS_H
#define BAMUTILS_H

#include <htslib/sam.h>

#include <string>
#include <vector>
#include <utility> // std::swap
#include <cstdio> // fprintf
#include <cstdlib> // std::abort
#include <cstdint> // uint16_t, etc

// Create a fresh unmapped BAM record (allocates a new bam1_t).
// NOTE: l_qname passed to bam_set1 must include the trailing '\0' => name.size() + 1.
static inline bam1_t* make_unmapped_bam(
const std::string& name,
const std::string& seq,
const std::string& qual_str,
uint16_t extra_flags = 0
) {
bam1_t* b = bam_init1();
if (!b) {
std::fprintf(stderr, "Failed to initialize bam1_t\n");
std::abort();
}

const int32_t l_qname = static_cast<int32_t>(name.size() + 1);
const int32_t l_seq = static_cast<int32_t>(seq.size());

const uint16_t flag = static_cast<uint16_t>(BAM_FUNMAP | extra_flags);

// We pass nullptr for qual here and then fill the numeric quals below.
// (Some htslib versions allow passing qual_str.c_str() directly; see fill_unmapped_bam)
if (bam_set1(
b,
l_qname,
name.c_str(),
flag,
-1, -1, // tid, pos
0, // mapq
0, // n_cigar
nullptr, // cigar
-1, -1, // mtid, mpos
0, // isize
l_seq,
seq.c_str(),
nullptr, // qual (filled below)
0 // l_aux
) < 0)
{
std::fprintf(stderr, "Failed to set BAM data\n");
std::abort();
}

// Fill numeric (0..93) qualities. bam_set1 stores quals as raw Phred, not ASCII.
uint8_t* qual = bam_get_qual(b);
if (qual) {
if (!qual_str.empty()) {
if (qual_str.size() != seq.size()) {
std::fprintf(stderr, "Sequence and quality length mismatch in make_unmapped_bam\n");
std::abort();
}
for (size_t i = 0; i < qual_str.size(); ++i) {
const int q = static_cast<int>(qual_str[i]) - 33;
qual[i] = static_cast<uint8_t>(q < 0 ? 0 : q);
}
} else {
// If no qualities provided, set to 0 (Phred 0).
for (int32_t i = 0; i < l_seq; ++i)
qual[i] = 0;
}
}

return b;
}

// Keep b->data allocated, just mark record empty (pool-friendly).
static inline void bam_recycle_keep_capacity(bam1_t* b) {
b->l_data = 0;
b->core = bam1_core_t(); // zero-initialize core
// Do NOT free b->data; m_data stays as-is.
}

// Fill an existing bam1_t as an unmapped record.
// Uses bam_set1 when available; otherwise falls back to making a temporary record and swapping.
static inline void fill_unmapped_bam(
bam1_t* b,
const std::string& name,
const std::string& seq,
const std::string& qual_str,
uint16_t extra_flags = 0
) {
#ifdef bam_set1
const int32_t l_qname = static_cast<int32_t>(name.size() + 1);
const int32_t l_seq = static_cast<int32_t>(seq.size());

const int ret = bam_set1(
b,
l_qname, name.c_str(),
static_cast<uint16_t>(BAM_FUNMAP | extra_flags),
-1, -1, 0, // tid, pos, mapq
0, nullptr, // n_cigar, cigar
-1, -1, 0, // mtid, mpos, isize
l_seq,
seq.c_str(),
qual_str.empty() ? nullptr : qual_str.c_str(),
0
);
if (ret < 0) {
std::fprintf(stderr, "bam_set1 failed\n");
std::abort();
}

// If qual_str was nullptr/empty, explicitly set qualities to 0.
if (qual_str.empty()) {
uint8_t* q = bam_get_qual(b);
if (q) {
for (int32_t i = 0; i < l_seq; ++i)
q[i] = 0;
}
}

#else
// Fallback: allocate a temporary record using existing helper,
// then swap contents into b (still avoids repeated malloc/free of b itself).
bam1_t* tmp = make_unmapped_bam(name, seq, qual_str, extra_flags);

std::swap(b->core, tmp->core);
std::swap(b->l_data, tmp->l_data);
std::swap(b->m_data, tmp->m_data);
std::swap(b->data, tmp->data);

bam_destroy1(tmp);
#endif
}

// ---- Templated helpers (no dependency on BamBatch typedef) ----
// Requirements on Batch:
// - batch.reads is std::vector<bam1_t*>
// - batch.used is an integer count of valid entries

template <typename Batch>
static inline void bambatch_ensure_capacity(Batch& batch, size_t additional) {
const size_t need = static_cast<size_t>(batch.used) + additional;
if (need <= batch.reads.size()) return;

const size_t old = batch.reads.size();
size_t neu = old ? old : 1024;
while (neu < need) neu *= 2;

batch.reads.resize(neu);
for (size_t i = old; i < neu; ++i)
batch.reads[i] = bam_init1();
}

template <typename Batch>
static inline void bambatch_append_unmapped(
Batch& batch,
const std::string& name,
const std::string& seq,
const std::string& qual,
uint16_t extra_flags
) {
bambatch_ensure_capacity(batch, 1);

bam1_t* b = batch.reads[batch.used++];
bam_recycle_keep_capacity(b);
fill_unmapped_bam(b, name, seq, qual, extra_flags);
}

#endif /* BAMUTILS_H */
49 changes: 2 additions & 47 deletions include/cifi.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#define SCIFI_H

#include <htslib/sam.h>
#include "bam-utils.h"
#include "reads.h"

struct EnzymeInfo {
Expand All @@ -19,54 +20,8 @@ std::vector<int> find_cut_positions(const InRead& read, const EnzymeInfo& enz);
void digest_read(const InRead& read, const EnzymeInfo& enz, std::vector<std::string>& frag_seqs, std::vector<std::string>& frag_quals, std::vector<int>* frag_lengths = nullptr);

// Digest sequences and corresponding quality strings
BamBatch chop_read_to_bambatch(const InRead& read, const EnzymeInfo& enz, uint32_t batchN = 0, uint32_t fileN = 0);
void chop_read_into_bambatch(const InRead& read, const EnzymeInfo& enz, BamBatch& batch);

void build_pe_bambatches(const InRead& read, const EnzymeInfo& enz, BamBatch& R1_batch, BamBatch& R2_batch);

static bam1_t* make_unmapped_bam(const std::string& name, const std::string& seq, const std::string& qual_str, uint16_t extra_flags = 0) {
bam1_t* b = bam_init1();
if (!b) {
fprintf(stderr, "Failed to initialize bam1_t\n");
std::abort();
}

const int32_t l_qname = static_cast<int32_t>(name.size());
const int32_t l_seq = static_cast<int32_t>(seq.size());

uint16_t flag = BAM_FUNMAP | extra_flags;

if (bam_set1(b,
l_qname,
name.c_str(),
flag,
-1, -1,
0, 0,
nullptr,
-1, -1,
0,
l_seq,
seq.c_str(),
nullptr,
0) < 0) {
fprintf(stderr, "Failed to set BAM data\n");
std::abort();
}

uint8_t* qual = bam_get_qual(b);
if (qual && !qual_str.empty()) {
if (qual_str.size() != seq.size()) {
fprintf(stderr, "Sequence and quality length mismatch in make_unmapped_bam\n");
std::abort();
}
for (size_t i = 0; i < qual_str.size(); ++i) {
qual[i] = static_cast<uint8_t>(qual_str[i] - 33);
}
} else if (qual) {
for (int32_t i = 0; i < l_seq; ++i)
qual[i] = 0;
}

return b;
}

#endif /* SCIFI_H */
26 changes: 17 additions & 9 deletions include/reads.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ struct UserInputRdeval : UserInput {
// CiFi
bool inputCifi = false;
std::string restrictionEnzyme = "";

int bgzip_level = -1;
};


Expand All @@ -64,8 +66,6 @@ struct InRead {
uint64_t A=0, C=0, G=0, T=0, N=0, lowerCount=0;
uint32_t uId=0, iId=0, seqPos=0;

std::vector<Tag> tags;
std::vector<std::deque<DBGpath>> variants;
float avgQuality = 0.0f;

InRead() = default;
Expand All @@ -83,8 +83,7 @@ struct InRead {
uint32_t readPos,
uint64_t A, uint64_t C, uint64_t G, uint64_t T,
uint64_t N, uint64_t lowerCount,
float avgQuality,
const std::vector<Tag>* tags = nullptr)
float avgQuality)
: seqHeader(std::move(header)),
seqComment(std::move(comment)),
inSequence(std::move(sequence)),
Expand All @@ -93,7 +92,6 @@ struct InRead {
uId(uId), iId(iId), seqPos(readPos), avgQuality(avgQuality)
{
(void)threadLog;
if (tags) this->tags = *tags;
#ifdef DEBUG
if (threadLog) {
threadLog->add("Processing read: " + seqHeader +
Expand All @@ -109,16 +107,18 @@ struct InRead {
}
};


// -------------------------------------------------------------
// ReadBatch and container structures
// -------------------------------------------------------------

template<typename T>
struct ReadBatch {
std::vector<T> reads;
uint32_t used = 0; // how many valid entries this time
uint32_t batchN = 0; // within-file batch index
uint32_t fileN = 0; // file ID

void reset_keep_capacity() { used = 0; } // DO NOT clear()
};

template<typename T>
Expand All @@ -131,7 +131,15 @@ struct FileBatches {
std::vector<ReadBatches<T>> files;
};

using BamBatch = ReadBatch<bam1_t*>;
struct BamBatch : ReadBatch<bam1_t*> {
~BamBatch() {
for (bam1_t* b : this->reads) {
if (b) bam_destroy1(b);
}
this->reads.clear();
this->used = 0;
}
};
using InReadBatch = ReadBatch<InRead>;

// -------------------------------------------------------------
Expand Down Expand Up @@ -196,7 +204,7 @@ class InReads {
size_t outBuffersN = 0;

// Input queues
BlockingQueue<std::unique_ptr<Sequences2>> free_pool_in;
ElasticBlockingPool<Sequences2> free_pool_in;
BlockingQueue<std::unique_ptr<Sequences2>> filled_q_in;

// Output queues (BAM batches)
Expand All @@ -222,7 +230,7 @@ class InReads {
inline bool applyFilter(uint64_t size, float avgQuality);

// Processing
float computeAvgQuality(std::string& sequenceQuality);
float computeAvgQuality(const std::string& sequenceQuality);
void filterRecords();
void extractInReads();
bool traverseInReads(Sequences2& readBatch);
Expand Down
Loading
Loading