Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 16 additions & 3 deletions cpp/src/bam/BamReader.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/***************************************************************************************************
*
* Copyright (C) 2025 Genome4me Incorporated - All Rights Reserved.
* Copyright (C) 2025-2026 Genome4me Incorporated - All Rights Reserved.
*
* This software, including its source code, embedded concepts, and associated
* documentation, is proprietary to Genome4me Incorporated and is protected
Expand All @@ -22,10 +22,10 @@
#include <utility>

namespace deeprm {
BamReader::BamReader(const string& path, int bq_threshold, char boi,
BamReader::BamReader(const string& path, int bq_threshold, char boi, int filter_flag,
unordered_map<string, int>& ref_index_dict)
: bam_path(path), bq_cutoff(bq_threshold), base_of_interest(boi),
ref_index_dict(ref_index_dict)
filter_flag(filter_flag), ref_index_dict(ref_index_dict)
{
}

Expand Down Expand Up @@ -370,8 +370,21 @@ namespace deeprm {

void BamReader::process_read(bam1_t* read, vector<BamRecord>& records)
{
// BAM_FUNMAP and l_qseq==0 are unconditional safety checks: downstream
// code dereferences the reference and the sequence, so an unmapped record
// (tid == -1, no CIGAR) or a sequence-less record (aligners commonly write
// secondary alignments with SEQ='*', i.e. l_qseq==0; the SAM/BAM spec
// permits but does not require this) would crash. The default `-g 276`
// happens to mask both cases via its 4 (UNMAP) and 256 (SECONDARY) bits,
// but any user-supplied -g value lacking either bit — e.g. `-g 0`,
// `-g 4`, `-g 16` — would let unsafe records through. These two checks
// make the path safe for any -g value.
//
// filter_flag is then the user-tunable SAM-flag mask, kept identical to
// MergedDataWorker's no-C path so that -C and no-C produce the same
// record set for any -g value.
if (read->core.flag & BAM_FUNMAP) return;
if (read->core.l_qseq == 0) return;
if (read->core.flag & filter_flag) return;

// Check for mv tag
uint8_t* mv_tag = bam_aux_get(read, "mv");
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/bam/BamReader.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/***************************************************************************************************
*
* Copyright (C) 2025 Genome4me Incorporated - All Rights Reserved.
* Copyright (C) 2025-2026 Genome4me Incorporated - All Rights Reserved.
*
* This software, including its source code, embedded concepts, and associated
* documentation, is proprietary to Genome4me Incorporated and is protected
Expand Down Expand Up @@ -44,6 +44,7 @@ namespace deeprm {
string bam_path;
int bq_cutoff;
char base_of_interest;
int filter_flag; // SAM flag bits to exclude (matches MergedDataWorker no-C path)
unordered_map<string, int> ref_index_dict;

vector<pair<int32_t, int32_t>> get_aligned_pairs(bam1_t* read, char boi);
Expand All @@ -52,7 +53,7 @@ namespace deeprm {
uint32_t get_md_reference_length(const char* md_tag);

public:
BamReader(const string& path, int bq_threshold, char boi,
BamReader(const string& path, int bq_threshold, char boi, int filter_flag,
unordered_map<string, int>& ref_index_dict);
~BamReader();

Expand Down
3 changes: 2 additions & 1 deletion cpp/src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,8 @@ void process_bam_worker(int worker_id, int num_workers, const Arguments& args,
{
log_info() << "Starting BAM worker " << worker_id << endl;

BamReader reader(args.bam_path, args.qcut, args.base_of_interest, ref_index_dict);
BamReader reader(args.bam_path, args.qcut, args.base_of_interest,
args.filter_flag, ref_index_dict);
auto records = reader.parse_bam(worker_id, num_workers, args.bam_threads);

size_t record_count = records.size(); // Save size before move
Expand Down
14 changes: 13 additions & 1 deletion cpp/src/merger/MergedDataWorker.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/***************************************************************************************************
*
* Copyright (C) 2025 Genome4me Incorporated - All Rights Reserved.
* Copyright (C) 2025-2026 Genome4me Incorporated - All Rights Reserved.
*
* This software, including its source code, embedded concepts, and associated
* documentation, is proprietary to Genome4me Incorporated and is protected
Expand Down Expand Up @@ -222,6 +222,18 @@ namespace deeprm {
bool MergedDataWorker::convert_bam1_to_record(bam1_t* read, sam_hdr_t* header,
BamRecord& record) const
{
// BAM_FUNMAP and l_qseq==0 are unconditional safety checks: downstream
// code dereferences the reference and the sequence, so an unmapped record
// (tid == -1, no CIGAR) or a sequence-less record (aligners commonly write
// secondary alignments with SEQ='*', i.e. l_qseq==0; the SAM/BAM spec
// permits but does not require this) would crash. The default `-g 276`
// happens to mask both cases via its 4 (UNMAP) and 256 (SECONDARY) bits,
// but any user-supplied -g value lacking either bit — e.g. `-g 0`,
// `-g 4`, `-g 16` — would let unsafe records through. These two checks
// make the path safe for any -g value.
if (read->core.flag & BAM_FUNMAP)
return false;
if (read->core.l_qseq == 0)
return false;
if (read->core.flag & args.filter_flag)
return false;

Expand Down
Loading
Loading