@@ -67,6 +67,111 @@ impl GeneIdInterner {
6767 }
6868}
6969
70+ // ===================================================================
71+ // BAM header validation
72+ // ===================================================================
73+
74+ /// Known duplicate-marking tool identifiers.
75+ ///
76+ /// These strings are matched (case-insensitively) against the `ID:` and `PN:`
77+ /// (program name) fields of `@PG` header entries. Matching is restricted to these
78+ /// fields to avoid false positives from command-line strings or unrelated tools
79+ /// (e.g., `picard SortSam` or `sambamba sort`).
80+ const KNOWN_DUP_MARKERS : & [ & str ] = & [
81+ "markduplicates" ,
82+ "samblaster" ,
83+ "sambamba markdup" ,
84+ "sambamba_markdup" ,
85+ "biobambam" ,
86+ "estreamer" ,
87+ "fgbio" ,
88+ "umis" ,
89+ "umi_tools" ,
90+ "umi-tools" ,
91+ "gencore" ,
92+ "gatk markduplicates" ,
93+ "sentieon dedup" ,
94+ ] ;
95+
96+ /// Extracts the value of a specific tag from a SAM header line.
97+ ///
98+ /// For example, given the line `@PG\tID:MarkDuplicates\tPN:MarkDuplicates` and
99+ /// the tag `"ID"`, returns `Some("MarkDuplicates")`.
100+ fn extract_header_tag < ' a > ( line : & ' a str , tag : & str ) -> Option < & ' a str > {
101+ let prefix = format ! ( "{}:" , tag) ;
102+ line. split ( '\t' )
103+ . find ( |field| field. starts_with ( & prefix) )
104+ . map ( |field| & field[ prefix. len ( ) ..] )
105+ }
106+
107+ /// Checks whether a SAM header text contains evidence of a duplicate-marking tool.
108+ ///
109+ /// Only the `ID:` and `PN:` fields of `@PG` lines are inspected to avoid false
110+ /// positives from command-line arguments or unrelated tools that happen to share
111+ /// a name (e.g., `picard SortSam`).
112+ ///
113+ /// Returns `true` if a known duplicate-marking tool is found.
114+ fn header_has_dup_marker ( header_text : & str ) -> bool {
115+ for line in header_text. lines ( ) {
116+ // Only inspect @PG (program) header lines
117+ if !line. starts_with ( "@PG" ) {
118+ continue ;
119+ }
120+
121+ // Extract only the ID and PN fields for matching
122+ let id = extract_header_tag ( line, "ID" ) . unwrap_or ( "" ) ;
123+ let pn = extract_header_tag ( line, "PN" ) . unwrap_or ( "" ) ;
124+ let id_lower = id. to_lowercase ( ) ;
125+ let pn_lower = pn. to_lowercase ( ) ;
126+
127+ if KNOWN_DUP_MARKERS
128+ . iter ( )
129+ . any ( |marker| id_lower. contains ( marker) || pn_lower. contains ( marker) )
130+ {
131+ debug ! ( "Found duplicate-marking tool in @PG header: {}" , line) ;
132+ return true ;
133+ }
134+ }
135+ false
136+ }
137+
138+ /// Checks the BAM `@PG` header lines for evidence that a duplicate-marking tool has been run.
139+ ///
140+ /// Returns `Ok(())` if a known duplicate-marking tool is found, or an error with
141+ /// a descriptive message if none is detected.
142+ ///
143+ /// # Arguments
144+ ///
145+ /// * `header` - The BAM header view to inspect
146+ /// * `bam_path` - The BAM file path (used in the error message)
147+ fn verify_duplicates_marked ( header : & bam:: HeaderView , bam_path : & str ) -> Result < ( ) > {
148+ let header_text = String :: from_utf8_lossy ( header. as_bytes ( ) ) ;
149+ if header_has_dup_marker ( & header_text) {
150+ return Ok ( ( ) ) ;
151+ }
152+
153+ anyhow:: bail!(
154+ "No duplicate-marking tool found in BAM header of '{}'.\n \
155+ \n \
156+ RustQC requires that BAM files have duplicates marked (SAM flag 0x400)\n \
157+ but NOT removed. The BAM @PG header lines do not contain evidence of a\n \
158+ known duplicate-marking tool.\n \
159+ \n \
160+ Please run one of the following tools before using RustQC:\n \
161+ \n \
162+ - Picard MarkDuplicates: picard MarkDuplicates I=input.bam O=marked.bam M=metrics.txt\n \
163+ - samblaster: samtools view -h input.bam | samblaster | samtools view -bS - > marked.bam\n \
164+ - sambamba markdup: sambamba markdup input.bam marked.bam\n \
165+ \n \
166+ If you are certain that duplicates are already marked, use --skip-dup-check to bypass this check.",
167+ bam_path
168+ )
169+ }
170+
171+ // ===================================================================
172+ // BAM flag constants
173+ // ===================================================================
174+
70175/// Flag indicating the read is a PCR or optical duplicate (0x400).
71176const BAM_FDUP : u16 = 0x400 ;
72177/// Flag indicating the read is unmapped (0x4).
@@ -681,6 +786,8 @@ fn process_chromosome_batch(
681786/// * `stranded` - Library strandedness (0, 1, or 2)
682787/// * `paired` - Whether the library is paired-end
683788/// * `threads` - Number of threads for BAM processing
789+ /// * `skip_dup_check` - If true, skip the BAM header check for duplicate-marking tools
790+ #[ allow( clippy:: too_many_arguments) ]
684791pub fn count_reads (
685792 bam_path : & str ,
686793 genes : & IndexMap < String , Gene > ,
@@ -689,18 +796,28 @@ pub fn count_reads(
689796 threads : usize ,
690797 chrom_mapping : & HashMap < String , String > ,
691798 chrom_prefix : Option < & str > ,
799+ skip_dup_check : bool ,
692800) -> Result < CountResult > {
693801 // Build gene ID interner for allocation-free lookups in the hot loop
694802 let interner = GeneIdInterner :: from_genes ( genes) ;
695803
696804 // Build spatial index (uses interned gene indices)
697805 let index = build_index ( genes, & interner) ;
698806
699- // Get chromosome names from header using a temporary reader
807+ // Get chromosome names from header using a temporary reader,
808+ // and verify that duplicates have been marked in the BAM file.
700809 let tid_to_name: Vec < String > = {
701810 let bam = bam:: Reader :: from_path ( bam_path)
702811 . with_context ( || format ! ( "Failed to open BAM file: {}" , bam_path) ) ?;
703812 let header = bam. header ( ) . clone ( ) ;
813+
814+ // Check for evidence of duplicate-marking in @PG header lines
815+ if skip_dup_check {
816+ info ! ( "Skipping duplicate-marking verification (--skip-dup-check)" ) ;
817+ } else {
818+ verify_duplicates_marked ( & header, bam_path) ?;
819+ }
820+
704821 ( 0 ..header. target_count ( ) )
705822 . map ( |tid| String :: from_utf8_lossy ( header. tid2name ( tid) ) . to_string ( ) )
706823 . collect ( )
@@ -1108,6 +1225,27 @@ pub fn count_reads(
11081225 merged. total_fragments
11091226 ) ;
11101227
1228+ // Verify that at least some reads were flagged as duplicates.
1229+ // Even if the @PG header check passed (or was skipped), it's possible that
1230+ // duplicates were not actually marked. Warn the user in that case.
1231+ if merged. total_dup == 0 && merged. total_mapped > 0 && !skip_dup_check {
1232+ anyhow:: bail!(
1233+ "No duplicate-flagged reads found among {} mapped reads in '{}'.\n \
1234+ \n \
1235+ Although the BAM header suggests a duplicate-marking tool was run,\n \
1236+ no reads have the duplicate flag (0x400) set. This likely means\n \
1237+ duplicates were removed rather than marked, or the tool did not\n \
1238+ flag any duplicates.\n \
1239+ \n \
1240+ RustQC requires duplicates to be marked (flagged) but NOT removed.\n \
1241+ Please re-run your duplicate-marking tool without removing duplicates.\n \
1242+ \n \
1243+ If this is expected, use --skip-dup-check to bypass this check.",
1244+ merged. total_mapped,
1245+ bam_path
1246+ ) ;
1247+ }
1248+
11111249 // Detect chromosome name mismatch
11121250 let genes_with_reads = merged
11131251 . gene_counts
@@ -1206,4 +1344,117 @@ mod tests {
12061344 // Read2 on - strand: effective + -> matches + gene
12071345 assert ! ( strand_matches( true , false , true , '+' , 1 ) ) ;
12081346 }
1347+
1348+ // ---------------------------------------------------------------
1349+ // Duplicate marking validation tests
1350+ // ---------------------------------------------------------------
1351+
1352+ #[ test]
1353+ fn test_extract_header_tag ( ) {
1354+ let line = "@PG\t ID:MarkDuplicates\t PN:MarkDuplicates\t VN:2.27.4\t CL:picard MarkDuplicates I=in.bam O=out.bam" ;
1355+ assert_eq ! ( extract_header_tag( line, "ID" ) , Some ( "MarkDuplicates" ) ) ;
1356+ assert_eq ! ( extract_header_tag( line, "PN" ) , Some ( "MarkDuplicates" ) ) ;
1357+ assert_eq ! ( extract_header_tag( line, "VN" ) , Some ( "2.27.4" ) ) ;
1358+ assert_eq ! (
1359+ extract_header_tag( line, "CL" ) ,
1360+ Some ( "picard MarkDuplicates I=in.bam O=out.bam" )
1361+ ) ;
1362+ assert_eq ! ( extract_header_tag( line, "XX" ) , None ) ;
1363+ }
1364+
1365+ #[ test]
1366+ fn test_dup_check_picard_markduplicates ( ) {
1367+ let header = "@HD\t VN:1.6\t SO:coordinate\n \
1368+ @PG\t ID:MarkDuplicates\t PN:MarkDuplicates\t VN:2.27.4";
1369+ assert ! ( header_has_dup_marker( header) ) ;
1370+ }
1371+
1372+ #[ test]
1373+ fn test_dup_check_samblaster ( ) {
1374+ let header = "@HD\t VN:1.6\n \
1375+ @PG\t ID:samblaster\t PN:samblaster\t VN:0.1.26";
1376+ assert ! ( header_has_dup_marker( header) ) ;
1377+ }
1378+
1379+ #[ test]
1380+ fn test_dup_check_sambamba_markdup ( ) {
1381+ let header = "@HD\t VN:1.6\n \
1382+ @PG\t ID:sambamba_markdup\t PN:sambamba markdup";
1383+ assert ! ( header_has_dup_marker( header) ) ;
1384+ }
1385+
1386+ #[ test]
1387+ fn test_dup_check_biobambam ( ) {
1388+ let header = "@HD\t VN:1.6\n \
1389+ @PG\t ID:biobambam2\t PN:biobambam";
1390+ assert ! ( header_has_dup_marker( header) ) ;
1391+ }
1392+
1393+ #[ test]
1394+ fn test_dup_check_case_insensitive ( ) {
1395+ // MarkDuplicates with different casing
1396+ let header = "@PG\t ID:MARKDUPLICATES\t PN:markduplicates" ;
1397+ assert ! ( header_has_dup_marker( header) ) ;
1398+ }
1399+
1400+ #[ test]
1401+ fn test_dup_check_no_dup_marker ( ) {
1402+ // Header with only alignment tools, no dup marker
1403+ let header = "@HD\t VN:1.6\t SO:coordinate\n \
1404+ @PG\t ID:bwa\t PN:bwa\t VN:0.7.17\n \
1405+ @PG\t ID:samtools\t PN:samtools\t VN:1.17";
1406+ assert ! ( !header_has_dup_marker( header) ) ;
1407+ }
1408+
1409+ #[ test]
1410+ fn test_dup_check_empty_header ( ) {
1411+ assert ! ( !header_has_dup_marker( "" ) ) ;
1412+ assert ! ( !header_has_dup_marker( "@HD\t VN:1.6\t SO:coordinate" ) ) ;
1413+ }
1414+
1415+ #[ test]
1416+ fn test_dup_check_picard_sortsam_no_false_positive ( ) {
1417+ // Picard SortSam should NOT match — only the CL field mentions "picard"
1418+ let header = "@PG\t ID:SortSam\t PN:SortSam\t CL:picard SortSam I=in.bam O=out.bam" ;
1419+ assert ! ( !header_has_dup_marker( header) ) ;
1420+ }
1421+
1422+ #[ test]
1423+ fn test_dup_check_picard_collect_metrics_no_false_positive ( ) {
1424+ // Another Picard tool that should not match
1425+ let header =
1426+ "@PG\t ID:CollectInsertSizeMetrics\t PN:CollectInsertSizeMetrics\t CL:picard CollectInsertSizeMetrics" ;
1427+ assert ! ( !header_has_dup_marker( header) ) ;
1428+ }
1429+
1430+ #[ test]
1431+ fn test_dup_check_sambamba_sort_no_false_positive ( ) {
1432+ // sambamba sort should NOT match
1433+ let header = "@PG\t ID:sambamba\t PN:sambamba sort" ;
1434+ assert ! ( !header_has_dup_marker( header) ) ;
1435+ }
1436+
1437+ #[ test]
1438+ fn test_dup_check_multiple_pg_lines ( ) {
1439+ // MarkDuplicates appears as a later @PG entry in the chain
1440+ let header = "@HD\t VN:1.6\t SO:coordinate\n \
1441+ @PG\t ID:bwa\t PN:bwa\t VN:0.7.17\n \
1442+ @PG\t ID:samtools\t PN:samtools\t VN:1.17\n \
1443+ @PG\t ID:MarkDuplicates\t PN:MarkDuplicates\t PP:samtools";
1444+ assert ! ( header_has_dup_marker( header) ) ;
1445+ }
1446+
1447+ #[ test]
1448+ fn test_dup_check_gatk_markduplicates ( ) {
1449+ let header = "@PG\t ID:GATK MarkDuplicates\t PN:GATK MarkDuplicates\t VN:4.4.0" ;
1450+ assert ! ( header_has_dup_marker( header) ) ;
1451+ }
1452+
1453+ #[ test]
1454+ fn test_dup_check_dup_marker_only_in_cl_no_match ( ) {
1455+ // A tool that mentions MarkDuplicates only in the CL field should not match
1456+ // (e.g., a wrapper script that calls MarkDuplicates but has its own ID/PN)
1457+ let header = "@PG\t ID:my_pipeline\t PN:my_pipeline\t CL:java -jar picard.jar MarkDuplicates" ;
1458+ assert ! ( !header_has_dup_marker( header) ) ;
1459+ }
12091460}
0 commit comments