diff --git a/src/main/java/htsjdk/samtools/CRAMContainerStreamWriter.java b/src/main/java/htsjdk/samtools/CRAMContainerStreamWriter.java index d8c1dd9568..a786bf4330 100644 --- a/src/main/java/htsjdk/samtools/CRAMContainerStreamWriter.java +++ b/src/main/java/htsjdk/samtools/CRAMContainerStreamWriter.java @@ -89,6 +89,7 @@ public CRAMContainerStreamWriter( this.cramIndexer = indexer; this.outputStreamIdentifier = outputIdentifier; this.containerFactory = new ContainerFactory(samFileHeader, encodingStrategy, referenceSource); + samFileHeader.getSequenceDictionary().requireMD5sOrThrow(outputIdentifier); } /** diff --git a/src/main/java/htsjdk/samtools/SAMFileWriterFactory.java b/src/main/java/htsjdk/samtools/SAMFileWriterFactory.java index b9bd1dcb14..b3dad09a58 100644 --- a/src/main/java/htsjdk/samtools/SAMFileWriterFactory.java +++ b/src/main/java/htsjdk/samtools/SAMFileWriterFactory.java @@ -23,9 +23,10 @@ */ package htsjdk.samtools; +import htsjdk.io.HtsPath; +import htsjdk.io.IOPath; import htsjdk.samtools.cram.ref.CRAMReferenceSource; import htsjdk.samtools.cram.ref.ReferenceSource; -import htsjdk.samtools.cram.structure.CRAMEncodingStrategy; import htsjdk.samtools.util.BlockCompressedOutputStream; import htsjdk.samtools.util.FileExtensions; import htsjdk.samtools.util.IOUtil; @@ -36,9 +37,11 @@ import java.io.File; import java.io.IOException; +import java.io.InputStream; import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; +import java.util.List; import java.util.zip.Deflater; import static htsjdk.samtools.SamReader.Type.*; @@ -634,6 +637,7 @@ private CRAMFileWriter createCRAMWriterWithSettings( final Path referenceFasta) { final CRAMReferenceSource referenceSource; + SAMFileHeader repairedHeader = header; if (referenceFasta == null) { log.info("Reference fasta is not provided when writing CRAM file " + outputFile.toUri().toString()); log.info("Will attempt to use a default reference or download as set by defaults:"); @@ -642,6 +646,10 @@ private CRAMFileWriter createCRAMWriterWithSettings( referenceSource = ReferenceSource.getDefaultCRAMReferenceSource(); } else { + // CRAMs are required to have a SAMHeader with a sequence dictionary that contains MD5 + // checksums for each sequence. Check the proposed dictionary to see if it has MD5s, and + // if not, attempt to update it with MD5s from the reference dictionary, if one exists. + repairedHeader = repairSequenceMD5s(header, new HtsPath(referenceFasta.toUri().toString())); referenceSource = new ReferenceSource(referenceFasta); } OutputStream cramOS = null; @@ -674,7 +682,7 @@ private CRAMFileWriter createCRAMWriterWithSettings( indexOS, presorted, referenceSource, - header, + repairedHeader, outputFile.toUri().toString()); setCRAMWriterDefaults(writer); @@ -687,6 +695,35 @@ private void setCRAMWriterDefaults(final CRAMFileWriter writer) { //writer.setEncodingParams(new CRAMEncodingStrategy()); } + // If any sequence records don't contain MD5s, attempt to repair them by consulting the reference's + // dictionary file, if one exists. + private SAMFileHeader repairSequenceMD5s(final SAMFileHeader header, final IOPath referenceFasta) { + final List missingMD5s = header.getSequenceDictionary().getSequencesWithMissingMD5s(); + if (!missingMD5s.isEmpty()) { + final String missingMD5Message = SAMSequenceDictionary.createFormattedMD5Message("SAM header", missingMD5s); + log.warn(String.format( + "%s Attempting to use the reference dictionary to repair missing MD5s required for CRAM output.", + missingMD5Message)); + final IOPath referenceDictionary = SAMSequenceDictionary.getFastaDictionaryFileName(referenceFasta); + try (final InputStream fastaDictionaryStream = referenceDictionary.getInputStream()) { + final SAMSequenceDictionary newDictionary = + SAMSequenceDictionary.loadSAMSequenceDictionary(fastaDictionaryStream); + newDictionary.requireMD5sOrThrow(referenceFasta.toString()); + final SAMFileHeader repairedHeader = header.clone(); + repairedHeader.setSequenceDictionary(newDictionary); + return repairedHeader; + } catch (final IOException | RuntimeException e) { + throw new RuntimeException( + String.format( + "The attempt to repair the missing sequence MD5s (%s) using the reference dictionary %s failed.", + missingMD5Message, + referenceDictionary), + e); + } + } + return header; + } + @Override public String toString() { return "SAMFileWriterFactory [createIndex=" + createIndex + ", createMd5File=" + createMd5File + ", useAsyncIo=" diff --git a/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java b/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java index cf40fe6532..7243d01499 100644 --- a/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java +++ b/src/main/java/htsjdk/samtools/SAMSequenceDictionary.java @@ -24,8 +24,15 @@ package htsjdk.samtools; import htsjdk.beta.plugin.HtsHeader; +import htsjdk.io.HtsPath; +import htsjdk.io.IOPath; +import htsjdk.samtools.util.BufferedLineReader; +import htsjdk.samtools.util.FileExtensions; import htsjdk.samtools.util.Log; +import htsjdk.samtools.util.RuntimeIOException; +import java.io.IOException; +import java.io.InputStream; import java.io.Serializable; import java.math.BigInteger; import java.security.MessageDigest; @@ -294,6 +301,99 @@ public String md5() { } } + + /** + * Given a fasta filename, return the name of the corresponding dictionary file. + */ + public static IOPath getFastaDictionaryFileName(IOPath fastaFile) { + final String fastaName = fastaFile.getURIString(); + int lastDot = fastaName.lastIndexOf('.'); + return new HtsPath(fastaName.substring(0, lastDot) + FileExtensions.DICT); + } + + /** + * Given a fasta dictionary file, returns its sequence dictionary + * + * @param fastaDictionary fasta dictionary file + * @return the SAMSequenceDictionary from fastaDictionaryFile + */ + public static SAMSequenceDictionary loadSAMSequenceDictionary( final IOPath fastaDictionary ) { + try ( final InputStream fastaDictionaryStream = fastaDictionary.getInputStream() ) { + return loadSAMSequenceDictionary(fastaDictionaryStream); + } + catch ( IOException e ) { + throw new RuntimeIOException("Error loading fasta dictionary file " + fastaDictionary, e); + } + } + + /** + * Given an InputStream connected to a fasta dictionary, returns its sequence dictionary + * + * Note: does not close the InputStream it's passed + * + * @param fastaDictionaryStream InputStream connected to a fasta dictionary + * @return the SAMSequenceDictionary from the fastaDictionaryStream + */ + public static SAMSequenceDictionary loadSAMSequenceDictionary( final InputStream fastaDictionaryStream ) { + // Don't close the reader when we're done, since we don't want to close the client's InputStream for them + final BufferedLineReader reader = new BufferedLineReader(fastaDictionaryStream); + + final SAMTextHeaderCodec codec = new SAMTextHeaderCodec(); + final SAMFileHeader header = codec.decode(reader, fastaDictionaryStream.toString()); + + // Make sure we have a valid sequence dictionary before continuing: + if (header.getSequenceDictionary() == null || header.getSequenceDictionary().isEmpty()) { + throw new RuntimeException ( + "Could not read sequence dictionary from given fasta stream " + + fastaDictionaryStream + ); + } + + return header.getSequenceDictionary(); + } + + final void requireMD5sOrThrow(final String contextMessage) { + final List missingMD5s = getSequencesWithMissingMD5s(); + if (!missingMD5s.isEmpty()) { + throw new RuntimeException(SAMSequenceDictionary.createFormattedMD5Message(contextMessage, missingMD5s)); + } + } + + /** + * @return all sequences in this dictionary that do not have an MD5 attribute + */ + List getSequencesWithMissingMD5s() { + return getSequences().stream().filter( + seqRec -> { + final String MD5 = seqRec.getMd5(); + return MD5 == null || MD5.length() == 0; + }).collect(Collectors.toList()); + } + + /** + * Create a formatted error string message describing which MD5s are missing from the sequence dictionary. + * + * @param contextID a string recognizable to the user describing the input that is the source of the failure + * @param badSequenceRecords the sequence with missing MD5s + * @return a message string suitable for presentation to the user + */ + static String createFormattedMD5Message(final String contextID, List badSequenceRecords) { + final int MAX_ERRORS_REPORTED = 10; + if (badSequenceRecords.size() != 0) { + return String.format( + "The sequence dictionary for %s is missing the required MD5 checksum for some contigs: %s%s.", + contextID, + badSequenceRecords.stream() + .limit(Integer.min(badSequenceRecords.size(), MAX_ERRORS_REPORTED)) + .map(SAMSequenceRecord::getSequenceName) + .collect(Collectors.joining(",")), + badSequenceRecords.size() > MAX_ERRORS_REPORTED ? + " and others": + ""); + } + return null; + } + @Override public int hashCode() { return mSequences.hashCode(); diff --git a/src/main/java/htsjdk/samtools/SAMSequenceRecord.java b/src/main/java/htsjdk/samtools/SAMSequenceRecord.java index 227f704f48..f2517ae009 100644 --- a/src/main/java/htsjdk/samtools/SAMSequenceRecord.java +++ b/src/main/java/htsjdk/samtools/SAMSequenceRecord.java @@ -24,16 +24,16 @@ package htsjdk.samtools; import htsjdk.samtools.util.Locatable; -import htsjdk.samtools.util.StringUtil; +import htsjdk.samtools.util.SequenceUtil; import java.math.BigInteger; -import java.util.ArrayList; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.HashSet; import java.util.LinkedHashSet; -import java.util.List; import java.util.Map; import java.util.Set; import java.util.regex.Pattern; @@ -148,6 +148,23 @@ public SAMSequenceRecord setMd5(final String value) { return this; } + /** + * Set the M5 attribute for this sequence using the provided sequenceBases. + * @param sequenceBases sequence bases to use to compute the MD5 for this sequence + * @return the update sequence record + */ + public SAMSequenceRecord setComputedMd5(final byte[] sequenceBases) { + try { + final MessageDigest md5Digester = MessageDigest.getInstance("MD5"); + if (md5Digester != null) { + setMd5(SequenceUtil.md5DigestToString(md5Digester.digest())); + } + } catch (final NoSuchAlgorithmException e) { + throw new RuntimeException("Error getting MD5 algorithm for sequence record MD5 computation", e); + } + return this; + } + public String getDescription() { return getAttribute(DESCRIPTION_TAG); } diff --git a/src/test/java/htsjdk/samtools/CRAMBAIIndexerTest.java b/src/test/java/htsjdk/samtools/CRAMBAIIndexerTest.java index 133c8edd89..1f7a3e860a 100644 --- a/src/test/java/htsjdk/samtools/CRAMBAIIndexerTest.java +++ b/src/test/java/htsjdk/samtools/CRAMBAIIndexerTest.java @@ -140,11 +140,6 @@ private void testMultipleContainerStream() throws IOException { final int refId1 = 0; final int refId2 = 1; - // for each ref, we alternate unmapped-placed with mapped - - final int expectedMapped = 1; - final int expectedUnmappedPlaced = 2; - try (final ByteArrayOutputStream contentStream = new ByteArrayOutputStream(); final ByteArrayOutputStream indexStream = new ByteArrayOutputStream()) { final CRAMContainerStreamWriter cramContainerStreamWriter = new CRAMContainerStreamWriter( diff --git a/src/test/java/htsjdk/samtools/CRAMCRAIIndexerTest.java b/src/test/java/htsjdk/samtools/CRAMCRAIIndexerTest.java index 4ded9ea581..6009b30acf 100644 --- a/src/test/java/htsjdk/samtools/CRAMCRAIIndexerTest.java +++ b/src/test/java/htsjdk/samtools/CRAMCRAIIndexerTest.java @@ -43,13 +43,13 @@ private void testCRAIIndexer(Index index) throws IOException { @Test public void testMultiRefContainer() throws IOException { - final SAMFileHeader samFileHeader = new SAMFileHeader(); + SAMFileHeader samFileHeader = new SAMFileHeader(); samFileHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate); samFileHeader.addSequence(new SAMSequenceRecord("1", 10)); samFileHeader.addSequence(new SAMSequenceRecord("2", 10)); samFileHeader.addSequence(new SAMSequenceRecord("3", 10)); - + samFileHeader = CRAMTestUtils.addFakeSequenceMD5s(samFileHeader); final ReferenceSource source = new ReferenceSource(new FakeReferenceSequenceFile(samFileHeader.getSequenceDictionary().getSequences())); byte[] cramBytes; diff --git a/src/test/java/htsjdk/samtools/CRAMComplianceTest.java b/src/test/java/htsjdk/samtools/CRAMComplianceTest.java index 790a72e5cd..70063227f5 100644 --- a/src/test/java/htsjdk/samtools/CRAMComplianceTest.java +++ b/src/test/java/htsjdk/samtools/CRAMComplianceTest.java @@ -179,7 +179,7 @@ private void doComplianceTest( // write them to a cram stream final ByteArrayOutputStream baos = new ByteArrayOutputStream(); final ReferenceSource source = new ReferenceSource(t.refFile); - final CRAMFileWriter cramFileWriter = new CRAMFileWriter(baos, source, samFileHeader, name); + final CRAMFileWriter cramFileWriter = new CRAMFileWriter(baos, source, CRAMTestUtils.addFakeSequenceMD5s(samFileHeader), name); for (SAMRecord samRecord : samRecords) { cramFileWriter.addAlignment(samRecord); } @@ -331,7 +331,7 @@ public void testBAMThroughCRAMRoundTrip(final String testFileName, final String final File tempCRAMFile = File.createTempFile("testBAMThroughCRAMRoundTrip", FileExtensions.CRAM); tempCRAMFile.deleteOnExit(); SAMFileHeader samHeader = getFileHeader(originalBAMInputFile, referenceFile); - writeRecordsToFile(originalBAMRecords, tempCRAMFile, referenceFile, samHeader); + writeRecordsToFile(originalBAMRecords, tempCRAMFile, referenceFile, CRAMTestUtils.addFakeSequenceMD5s(samHeader)); // read the CRAM records back in and compare to the original BAM records List cramRecords = getSAMRecordsFromFile(tempCRAMFile, referenceFile); @@ -359,7 +359,7 @@ public void testBAMThroughCRAMRoundTripViaPath() throws IOException { try (FileSystem jimfs = Jimfs.newFileSystem(Configuration.unix())) { final Path tempCRAM = jimfs.getPath("testBAMThroughCRAMRoundTrip" + FileExtensions.CRAM); SAMFileHeader samHeader = getFileHeader(originalBAMInputFile, referenceFile); - writeRecordsToPath(originalBAMRecords, tempCRAM, referenceFile, samHeader); + writeRecordsToPath(originalBAMRecords, tempCRAM, referenceFile, CRAMTestUtils.addFakeSequenceMD5s(samHeader)); // read the CRAM records back in and compare to the original BAM records List cramRecords = getSAMRecordsFromPath(tempCRAM, referenceFile); diff --git a/src/test/java/htsjdk/samtools/CRAMContainerStreamWriterTest.java b/src/test/java/htsjdk/samtools/CRAMContainerStreamWriterTest.java index db76a29c04..fe8dd6aaf0 100644 --- a/src/test/java/htsjdk/samtools/CRAMContainerStreamWriterTest.java +++ b/src/test/java/htsjdk/samtools/CRAMContainerStreamWriterTest.java @@ -58,7 +58,7 @@ private ReferenceSource createReferenceSource() { } private void doTest(final List samRecords, final ByteArrayOutputStream outStream, final OutputStream indexStream) { - final SAMFileHeader header = createSAMHeader(SAMFileHeader.SortOrder.coordinate); + final SAMFileHeader header = CRAMTestUtils.addFakeSequenceMD5s(createSAMHeader(SAMFileHeader.SortOrder.coordinate)); final ReferenceSource refSource = createReferenceSource(); final CRAMContainerStreamWriter containerStream = new CRAMContainerStreamWriter(outStream, indexStream, refSource, header, "test"); @@ -102,7 +102,7 @@ public void testCRAMContainerStreamNoIndex() { @Test(description = "Test CRAMContainerStream aggregating multiple partitions") public void testCRAMContainerAggregatePartitions() throws IOException { - final SAMFileHeader header = createSAMHeader(SAMFileHeader.SortOrder.coordinate); + final SAMFileHeader header = CRAMTestUtils.addFakeSequenceMD5s(createSAMHeader(SAMFileHeader.SortOrder.coordinate)); final ReferenceSource refSource = createReferenceSource(); // create a bunch of records and write them out to separate streams in groups @@ -161,7 +161,7 @@ public void testCRAMContainerStreamWithBaiIndex() throws IOException { @Test(description = "Test CRAMContainerStream with crai index") public void testCRAMContainerStreamWithCraiIndex() throws IOException { final List samRecords = createRecords(100); - final SAMFileHeader header = createSAMHeader(SAMFileHeader.SortOrder.coordinate); + final SAMFileHeader header = CRAMTestUtils.addFakeSequenceMD5s(createSAMHeader(SAMFileHeader.SortOrder.coordinate)); try (ByteArrayOutputStream outStream = new ByteArrayOutputStream(); ByteArrayOutputStream indexStream = new ByteArrayOutputStream()) { doTestWithIndexer(samRecords, outStream, header, new CRAMCRAIIndexer(indexStream, header)); @@ -171,6 +171,19 @@ public void testCRAMContainerStreamWithCraiIndex() throws IOException { } } + @Test(expectedExceptions=RuntimeException.class) + public void testRejectDictionaryWithMissingMD5s() { + final SAMFileHeader samHeader = createSAMHeader(SAMFileHeader.SortOrder.coordinate); + final ByteArrayOutputStream outStream = new ByteArrayOutputStream(); + final ReferenceSource refSource = createReferenceSource(); + try { + new CRAMContainerStreamWriter(outStream, null, refSource, samHeader, "test"); + } catch (final RuntimeException e) { + Assert.assertTrue(e.getMessage().contains("missing the required MD5 checksum")); + throw e; + } + } + private void checkCRAMContainerStream(ByteArrayOutputStream outStream, ByteArrayOutputStream indexStream, String indexExtension) throws IOException { // write the file out final File cramTempFile = File.createTempFile("cramContainerStreamTest", ".cram"); diff --git a/src/test/java/htsjdk/samtools/CRAMFileWriterTest.java b/src/test/java/htsjdk/samtools/CRAMFileWriterTest.java index c61f9acd49..a14fdfe277 100644 --- a/src/test/java/htsjdk/samtools/CRAMFileWriterTest.java +++ b/src/test/java/htsjdk/samtools/CRAMFileWriterTest.java @@ -170,7 +170,7 @@ private void doTest(final List samRecords) { final ReferenceSource refSource = createReferenceSource(); final ByteArrayOutputStream os = new ByteArrayOutputStream(); - try (CRAMFileWriter writer = new CRAMFileWriter(os, refSource, header, null)) { + try (CRAMFileWriter writer = new CRAMFileWriter(os, refSource, CRAMTestUtils.addFakeSequenceMD5s(header), null)) { writeRecordsToCRAM(writer, samRecords); } @@ -185,7 +185,7 @@ public void testCRAMWriterWithIndex() { final ByteArrayOutputStream indexStream = new ByteArrayOutputStream(); final List samRecords = createRecords(100); - try (CRAMFileWriter writer = new CRAMFileWriter(outStream, indexStream, refSource, header, null)) { + try (CRAMFileWriter writer = new CRAMFileWriter(outStream, indexStream, refSource, CRAMTestUtils.addFakeSequenceMD5s(header), null)) { writeRecordsToCRAM(writer, samRecords); } @@ -201,7 +201,8 @@ public void testCRAMWriterNotPresorted() { final ByteArrayOutputStream indexStream = new ByteArrayOutputStream(); final List samRecords = createRecords(100); - try (CRAMFileWriter writer = new CRAMFileWriter(outStream, indexStream, false, refSource, header, null)) { + final SAMFileHeader repairedHeader = CRAMTestUtils.addFakeSequenceMD5s(header); + try (CRAMFileWriter writer = new CRAMFileWriter(outStream, indexStream, false, refSource, repairedHeader, null)) { // force records to not be coordinate sorted to ensure we're relying on presorted=false samRecords.sort(new SAMRecordCoordinateComparator().reversed()); @@ -246,7 +247,7 @@ public void test_roundtrip_tlen_preserved() throws IOException { final List records = new ArrayList<>(); try (SamReader reader = SamReaderFactory.make().open(new File(SAM_TOOLS_TEST_DIR, "cram_tlen_reads.sorted.sam")); - CRAMFileWriter writer = new CRAMFileWriter(baos, source, reader.getFileHeader(), "test.cram")) { + CRAMFileWriter writer = new CRAMFileWriter(baos, source, CRAMTestUtils.addFakeSequenceMD5s(reader.getFileHeader()), "test.cram")) { for (final SAMRecord record : reader) { writer.addAlignment(record); records.add(record); @@ -283,7 +284,10 @@ public void test_roundtrip_many_reads() throws IOException { IOUtil.deleteOnExit(ReferenceSequenceFileFactory.getDefaultDictionaryForReferenceSequence(newFasta)); final SAMRecordSetBuilder samRecordSetBuilder = new SAMRecordSetBuilder(); - samRecordSetBuilder.setHeader(SAMRecordSetBuilder.makeDefaultHeader(SAMFileHeader.SortOrder.coordinate, 10_000, true)); + samRecordSetBuilder.setHeader( + CRAMTestUtils.addFakeSequenceMD5s( + SAMRecordSetBuilder.makeDefaultHeader(SAMFileHeader.SortOrder.coordinate, 10_000, true)) + ); samRecordSetBuilder.writeRandomReference(newFasta); final List records = new ArrayList<>(); diff --git a/src/test/java/htsjdk/samtools/CRAMFileWriterWithIndexTest.java b/src/test/java/htsjdk/samtools/CRAMFileWriterWithIndexTest.java index 5f62340014..566f16bf46 100644 --- a/src/test/java/htsjdk/samtools/CRAMFileWriterWithIndexTest.java +++ b/src/test/java/htsjdk/samtools/CRAMFileWriterWithIndexTest.java @@ -199,12 +199,14 @@ public void beforeTest() { private static void addRandomSequence(SAMFileHeader header, int length, InMemoryReferenceSequenceFile rsf) { String name = String.valueOf(header.getSequenceDictionary().size() + 1); - header.addSequence(new SAMSequenceRecord(name, length)); + final SAMSequenceRecord sequenceRecord = new SAMSequenceRecord(name, length); byte[] refBases = new byte[length]; byte[] alphabet = "ACGTN".getBytes(); - for (int i = 0; i < refBases.length; i++) + for (int i = 0; i < refBases.length; i++) { refBases[i] = alphabet[random.nextInt(alphabet.length)]; - + } rsf.add(name, refBases); + sequenceRecord.setComputedMd5(refBases); + header.addSequence(sequenceRecord); } } diff --git a/src/test/java/htsjdk/samtools/CRAMIndexPermutationsTests.java b/src/test/java/htsjdk/samtools/CRAMIndexPermutationsTests.java index 39c9c42874..024b702206 100644 --- a/src/test/java/htsjdk/samtools/CRAMIndexPermutationsTests.java +++ b/src/test/java/htsjdk/samtools/CRAMIndexPermutationsTests.java @@ -24,11 +24,12 @@ public class CRAMIndexPermutationsTests extends HtsjdkTest { private static final File TEST_DATA_DIR = new File("src/test/resources/htsjdk/samtools/cram"); - // BAM test file for comparison with CRAM results. This is asubset of NA12878 + // BAM test file for comparison with CRAM results. This is a subset of NA12878 // (20:10000000-10004000, 21:10000000-10004000, +2000 unmapped). There is no corresponding // reference checked in for this file since its too big, but because we're only using it to validate // index query results, we only care that the right the reads are returned, not what the read bases are, so - // we use a synthetic in-memory fake reference of all 'N's to convert it to CRAM. + // we use a synthetic in-memory fake reference of all 'N's to convert it to CRAM. Note that the MD tags in + // this bam are also synthetic (fake/made up in order to satisfy the MD5 requirement for write). private static final File truthBAM = new File(TEST_DATA_DIR, "NA12878.20.21.unmapped.orig.bam"); //Note that these tests can be REALLY slow because although we don't use the real (huge) reference, we use a fake diff --git a/src/test/java/htsjdk/samtools/CRAMMergerTest.java b/src/test/java/htsjdk/samtools/CRAMMergerTest.java index 34f793cc84..8a5b3858e6 100644 --- a/src/test/java/htsjdk/samtools/CRAMMergerTest.java +++ b/src/test/java/htsjdk/samtools/CRAMMergerTest.java @@ -208,7 +208,11 @@ public void test() throws IOException { // 1. Read an input CRAM and write it out in partitioned form (header, parts, terminator) ReferenceSource referenceSource = new ReferenceSource(CRAM_REF); try (SamReader samReader = SamReaderFactory.makeDefault().referenceSource(referenceSource).open(CRAM_FILE); - PartitionedCRAMFileWriter partitionedCRAMFileWriter = new PartitionedCRAMFileWriter(outputDir, referenceSource, samReader.getFileHeader(), 250)) { + PartitionedCRAMFileWriter partitionedCRAMFileWriter = new PartitionedCRAMFileWriter( + outputDir, + referenceSource, + CRAMTestUtils.addFakeSequenceMD5s(samReader.getFileHeader()), + 250)) { for (SAMRecord samRecord : samReader) { partitionedCRAMFileWriter.addAlignment(samRecord); } diff --git a/src/test/java/htsjdk/samtools/CRAMSliceMD5Test.java b/src/test/java/htsjdk/samtools/CRAMSliceMD5Test.java index a80ea59035..b85fe9575d 100644 --- a/src/test/java/htsjdk/samtools/CRAMSliceMD5Test.java +++ b/src/test/java/htsjdk/samtools/CRAMSliceMD5Test.java @@ -132,7 +132,7 @@ record = new SAMRecord(samFileHeader); // write a valid CRAM with a valid reference source: final ByteArrayOutputStream baos = new ByteArrayOutputStream(); - try (final CRAMFileWriter writer = new CRAMFileWriter(baos, referenceSourceUpperCased, samFileHeader, "test")) { + try (final CRAMFileWriter writer = new CRAMFileWriter(baos, referenceSourceUpperCased, CRAMTestUtils.addFakeSequenceMD5s(samFileHeader), "test")) { writer.addAlignment(record); } cramData = baos.toByteArray(); diff --git a/src/test/java/htsjdk/samtools/CRAMTestUtils.java b/src/test/java/htsjdk/samtools/CRAMTestUtils.java index 2476ce7783..ba1f8a0a01 100644 --- a/src/test/java/htsjdk/samtools/CRAMTestUtils.java +++ b/src/test/java/htsjdk/samtools/CRAMTestUtils.java @@ -10,6 +10,7 @@ import java.nio.file.Files; import java.util.Arrays; import java.util.Collection; +import java.util.List; public final class CRAMTestUtils { @@ -45,7 +46,7 @@ public static long writeToCRAMWithEncodingStrategy( null, true, referenceSource, - reader.getFileHeader(), + CRAMTestUtils.addFakeSequenceMD5s(reader.getFileHeader()), tempOutputCRAM.getName()); final SAMRecordIterator inputIterator = reader.iterator(); while (inputIterator.hasNext()) { @@ -67,7 +68,7 @@ public static CRAMFileReader writeAndReadFromInMemoryCram(Collection refFile.add("chr1", ref); ReferenceSource source = new ReferenceSource(refFile); final SAMFileHeader header = records.iterator().next().getHeader(); - return writeAndReadFromInMemoryCram(records, source, header); + return writeAndReadFromInMemoryCram(records, source, CRAMTestUtils.addFakeSequenceMD5s(header)); } /** @@ -76,7 +77,7 @@ public static CRAMFileReader writeAndReadFromInMemoryCram(Collection * @return a CRAMFileReader reading from an in memory buffer that has had the records written into it, uses a fake reference with all A's */ public static CRAMFileReader writeAndReadFromInMemoryCram(SAMRecordSetBuilder records) throws IOException { - return writeAndReadFromInMemoryCram(records.getRecords(), getFakeReferenceSource(), records.getHeader()); + return writeAndReadFromInMemoryCram(records.getRecords(), getFakeReferenceSource(), CRAMTestUtils.addFakeSequenceMD5s(records.getHeader())); } private static CRAMFileReader writeAndReadFromInMemoryCram(Collection records, CRAMReferenceSource source, SAMFileHeader header) throws IOException { @@ -102,4 +103,21 @@ public static CRAMReferenceSource getFakeReferenceSource() { return bases; }; } + + // Synthesize fake MD5 values for test header sequences for which we don't have actual references + // (only fake in-memory references) or for which we can't re-create the CRAM, in order to satisfy + // the CRAM writer's requirement that all sequence dictionaries have MD5 checksums. + public static SAMFileHeader addFakeSequenceMD5s(final SAMFileHeader header) { + final String FAKE_MD5 = "deadbeefdeadbeefdeadbeefdeadbeef"; + final SAMSequenceDictionary headerDictionary = header.getSequenceDictionary(); + if (!headerDictionary.getSequencesWithMissingMD5s().isEmpty()) { + final SAMFileHeader repairedHeader = header.clone(); + final SAMSequenceDictionary newDictionary = repairedHeader.getSequenceDictionary(); + final List badSequences = newDictionary.getSequencesWithMissingMD5s(); + badSequences.forEach(seq -> seq.setMd5(FAKE_MD5)); + return repairedHeader; + } + return header; + } + } diff --git a/src/test/java/htsjdk/samtools/SAMFileWriterFactoryTest.java b/src/test/java/htsjdk/samtools/SAMFileWriterFactoryTest.java index 7046996167..ed9a71313b 100644 --- a/src/test/java/htsjdk/samtools/SAMFileWriterFactoryTest.java +++ b/src/test/java/htsjdk/samtools/SAMFileWriterFactoryTest.java @@ -26,6 +26,9 @@ import com.google.common.jimfs.Configuration; import com.google.common.jimfs.Jimfs; import htsjdk.HtsjdkTest; +import htsjdk.beta.io.IOPathUtils; +import htsjdk.io.HtsPath; +import htsjdk.io.IOPath; import htsjdk.samtools.cram.ref.ReferenceSource; import htsjdk.samtools.seekablestream.SeekableFileStream; import htsjdk.samtools.util.FileExtensions; @@ -255,7 +258,6 @@ private SAMFileWriterFactory createWriterFactoryWithOptions(final SAMFileHeader factory.setCreateMd5File(true); // index only created if coordinate sorted header.setSortOrder(SAMFileHeader.SortOrder.coordinate); - header.addSequence(new SAMSequenceRecord("chr1", 123)); header.addReadGroup(new SAMReadGroupRecord("1")); return factory; } @@ -439,6 +441,48 @@ public void testMakeCRAMWriterPresortedDefault() throws Exception { verifyWriterOutput(outputFile, new ReferenceSource(referenceFile), nRecs, true); } + @Test(expectedExceptions=RuntimeException.class) + public void testRejectMissingMD5WithNoReferenceDictionary() throws IOException { + final IOPath testCRAM = new HtsPath(TEST_DATA_DIR + "/cram/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.10m-10m100.cram"); + final IOPath originalReference = new HtsPath("src/test/resources/htsjdk/samtools/reference/human_g1k_v37.20.21.fasta.gz"); + + // this reference has a reference dictionary, so copy it so it won't be found in order to test that + // the lack of a dictionary causes a cram with no MD5s to be rejected + final IOPath referenceCopy = IOPathUtils.createTempPath("fastaWithNoDictionary", ".fasta"); + final IOPath outputFile = IOPathUtils.createTempPath("fastaWithNoDictionaryTest", ".cram"); + IOUtil.copyFile(originalReference.toPath().toFile(), referenceCopy.toPath().toFile()); + + try (final SamReader samReader = SamReaderFactory.make().open(testCRAM.toPath().toFile()); + final SAMFileWriter samWriter = new SAMFileWriterFactory().makeWriter( + samReader.getFileHeader(), true, outputFile.toPath().toFile(), originalReference.toPath().toFile())) { + } catch (final RuntimeException e) { + Assert.assertTrue(e.getMessage().contains("The attempt to repair the missing sequence MD5s")); + throw e; + } + } + + @Test + public void testRepairMissingMD5sFromReferenceDictionary() throws IOException { + // use a CRAM that has a header sequence dictionary with no MD5s, and a corresponding reference + // that has a ref .dict + final IOPath testCRAM = new HtsPath(TEST_DATA_DIR + "/cram/cramQueryTest.cram"); + final IOPath reference = new HtsPath(TEST_DATA_DIR + "/cram/human_g1k_v37.20.21.1-100.fasta"); + + final IOPath outputFile = IOPathUtils.createTempPath("fastaWithNoDictionaryTest", ".cram"); + try (final SamReader samReader = SamReaderFactory.make().open(testCRAM.toPath().toFile()); + // no need to transfer the records, only the header + final SAMFileWriter samWriter = new SAMFileWriterFactory().makeWriter( + samReader.getFileHeader(), true, outputFile.toPath().toFile(), reference.toPath().toFile())) { + // make sure the original header has no MD5s, otherwise the test won't prove anything + Assert.assertTrue(!samReader.getFileHeader().getSequenceDictionary().getSequencesWithMissingMD5s().isEmpty()); + } + + //now make sure the newly written file contains md5s + try (final SamReader samReader = SamReaderFactory.make().open(outputFile.toPath().toFile())) { + Assert.assertTrue(samReader.getFileHeader().getSequenceDictionary().getSequencesWithMissingMD5s().isEmpty()); + } + } + @Test public void testAsync() throws IOException { final SAMFileWriterFactory builder = new SAMFileWriterFactory(); diff --git a/src/test/java/htsjdk/samtools/SAMRecordUnitTest.java b/src/test/java/htsjdk/samtools/SAMRecordUnitTest.java index 9b0ebdc6b1..548a5eb935 100644 --- a/src/test/java/htsjdk/samtools/SAMRecordUnitTest.java +++ b/src/test/java/htsjdk/samtools/SAMRecordUnitTest.java @@ -1161,7 +1161,7 @@ public void testWriteSamWithEmptyArray(Object emptyArray, Class arrayClass, S .setCreateMd5File(false) .setCreateIndex(false); final Path reference = IOUtil.getPath("src/test/resources/htsjdk/samtools/one-contig.fasta"); - try (final SAMFileWriter samFileWriter = writerFactory.makeWriter(samRecords.getHeader(), false, tmp, reference)) { + try (final SAMFileWriter samFileWriter = writerFactory.makeWriter(CRAMTestUtils.addFakeSequenceMD5s(samRecords.getHeader()), false, tmp, reference)) { samFileWriter.addAlignment(record); } diff --git a/src/test/java/htsjdk/samtools/cram/LosslessRoundTripTest.java b/src/test/java/htsjdk/samtools/cram/LosslessRoundTripTest.java index 1ae8e142aa..3478e9fdf1 100644 --- a/src/test/java/htsjdk/samtools/cram/LosslessRoundTripTest.java +++ b/src/test/java/htsjdk/samtools/cram/LosslessRoundTripTest.java @@ -24,6 +24,7 @@ public void test_MD_NM() throws IOException { samFileHeader.addSequence(new SAMSequenceRecord("1", 3)); samFileHeader.addReadGroup(new SAMReadGroupRecord("some read group")); + samFileHeader = CRAMTestUtils.addFakeSequenceMD5s(samFileHeader); CRAMFileWriter w = new CRAMFileWriter(baos, source, samFileHeader, null); SAMRecord record = new SAMRecord(samFileHeader); record.setReadName("name"); diff --git a/src/test/java/htsjdk/samtools/cram/structure/CRAMStructureTestHelper.java b/src/test/java/htsjdk/samtools/cram/structure/CRAMStructureTestHelper.java index ad6ca4f31b..ce7adf9ab2 100644 --- a/src/test/java/htsjdk/samtools/cram/structure/CRAMStructureTestHelper.java +++ b/src/test/java/htsjdk/samtools/cram/structure/CRAMStructureTestHelper.java @@ -418,7 +418,8 @@ private static SAMFileHeader createSAMFileHeader() { final List sequenceRecords = new ArrayList<>(); sequenceRecords.add(new SAMSequenceRecord("0", REFERENCE_CONTIG_LENGTH)); sequenceRecords.add(new SAMSequenceRecord("1", REFERENCE_CONTIG_LENGTH)); - final SAMFileHeader header = new SAMFileHeader(new SAMSequenceDictionary(sequenceRecords)); + final SAMFileHeader header = CRAMTestUtils.addFakeSequenceMD5s( + new SAMFileHeader(new SAMSequenceDictionary(sequenceRecords))); header.setSortOrder(SAMFileHeader.SortOrder.coordinate); return header; } diff --git a/src/test/resources/htsjdk/samtools/cram/NA12878.20.21.unmapped.orig.bai b/src/test/resources/htsjdk/samtools/cram/NA12878.20.21.unmapped.orig.bai index 2106f10cc7..7bace0354c 100644 Binary files a/src/test/resources/htsjdk/samtools/cram/NA12878.20.21.unmapped.orig.bai and b/src/test/resources/htsjdk/samtools/cram/NA12878.20.21.unmapped.orig.bai differ diff --git a/src/test/resources/htsjdk/samtools/cram/NA12878.20.21.unmapped.orig.bam b/src/test/resources/htsjdk/samtools/cram/NA12878.20.21.unmapped.orig.bam index 01ad319f8b..aa10eeaa21 100644 Binary files a/src/test/resources/htsjdk/samtools/cram/NA12878.20.21.unmapped.orig.bam and b/src/test/resources/htsjdk/samtools/cram/NA12878.20.21.unmapped.orig.bam differ