Skip to content

Commit f5a0396

Browse files
committed
BUG Fix important sequence reinjection bug
Before, it was possible that the wrong mate would be reinjected, leading to length mismatches
1 parent 84e6c4a commit f5a0396

File tree

5 files changed

+64
-41
lines changed

5 files changed

+64
-41
lines changed

ChangeLog

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1-
Version 1.1.0 2019-01-25 by luispedro
1+
Version 1.1.0+
2+
* Fix CIGAR reinjection bug
3+
4+
Version 1.1.0 2020-01-25 by luispedro
25
* Reintroduce zstd compression (after fixes upstream)
36
* Fix CIGAR interpretation (#109) occurring when I is present
47
* Call bwa mem so that it behaves in a deterministic way (independently of

NGLess/Interpret.hs

Lines changed: 1 addition & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ import Interpretation.Count (executeCount, executeCountCheck)
103103
import Interpretation.CountFile (executeCountFile)
104104
import Interpretation.FastQ
105105
import Interpretation.Write
106-
import Interpretation.Select (executeSelect, executeMappedReadMethod, splitSamlines3, fixCigar)
106+
import Interpretation.Select (executeSelect, executeMappedReadMethod, reinjectSequences)
107107
import Interpretation.Unique
108108
import Interpretation.Substrim
109109
import Utils.Utils
@@ -589,25 +589,6 @@ executeSelectWBlock input@NGOMappedReadSet{ nglSamFile = isam} args (Block [Vari
589589
_ -> nglTypeError ("Expected variable "++show var++" to contain a mapped read.")
590590

591591
else return []
592-
reinjectSequences :: [SamLine] -> [SamLine] -> NGLess [SamLine]
593-
reinjectSequences original filtered = case (splitSamlines3 original, splitSamlines3 filtered) of
594-
((o1, o2, os), (f1, f2, fs)) -> do
595-
r1 <- reinjectSequences' o1 f1
596-
r2 <- reinjectSequences' o2 f2
597-
ss <- reinjectSequences' os fs
598-
return (r1 ++ r2 ++ ss)
599-
reinjectSequences' :: [SamLine] -> [SamLine] -> NGLess [SamLine]
600-
reinjectSequences' original f@(s@SamLine{}:rs)
601-
| not (any hasSequence f) = let
602-
fixed = flip map (filter hasSequence original) $ \s' -> do
603-
cigar <- fixCigar (samCigar s) (samLength s')
604-
return (s { samSeq = samSeq s', samQual = samQual s', samCigar = cigar }:rs)
605-
asum' [] = return f
606-
asum' (x:xs) = case x of
607-
Right{} -> x
608-
Left{} -> asum' xs
609-
in asum' fixed
610-
reinjectSequences' _ f = return f
611592
executeSelectWBlock expr _ _ = unreachable ("Select with block, unexpected argument: " ++ show expr)
612593

613594

NGLess/Interpretation/Select.hs

Lines changed: 48 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
{- Copyright 2015-2019 NGLess Authors
1+
{- Copyright 2015-2020 NGLess Authors
22
- License: MIT
33
-}
44

@@ -7,8 +7,8 @@
77
module Interpretation.Select
88
( executeSelect
99
, executeMappedReadMethod
10-
, splitSamlines3
1110
, fixCigar
11+
, reinjectSequences
1212
) where
1313

1414
import qualified Data.ByteString as B
@@ -25,7 +25,7 @@ import qualified Data.Text.Encoding as TE
2525
import Data.Bits (Bits(..))
2626
import Control.Monad.Except (throwError)
2727
import Data.Either.Combinators (fromRight)
28-
import Data.List (foldl', find)
28+
import Data.List (foldl')
2929
import Data.Either.Extra (eitherToMaybe)
3030
import Data.Tuple.Extra (fst3)
3131
import Data.Ratio (Ratio, (%))
@@ -34,7 +34,7 @@ import Data.Maybe
3434

3535
import Data.Sam
3636
import FileManagement
37-
import FileOrStream
37+
import FileOrStream (asSamStream, FileOrStream(..))
3838
import Language
3939
import Output
4040

@@ -81,24 +81,52 @@ _parseConditions args = do
8181
-- 2) we want to keep the full sequence, so we want to use soft trimming (if
8282
-- at all possible)
8383
matchConditions :: Bool -> MatchCondition -> [(SamLine,B.ByteString)] -> NGLess [(SamLine, B.ByteString)]
84-
matchConditions doReinject conds sg = reinjectSequences doReinject (matchConditions' conds sg)
84+
matchConditions doReinject conds sg =
85+
let sg' = matchConditions' conds sg
86+
in if doReinject
87+
then reinjectSequencesIfNeeded sg sg'
88+
else pure sg'
89+
90+
toStrictBS :: BB.Builder -> B.ByteString
91+
toStrictBS = BL.toStrict . BB.toLazyByteString
92+
93+
94+
reinjectSequencesIfNeeded :: [(SamLine,B.ByteString)] -> [(SamLine,B.ByteString)] -> NGLess [(SamLine,B.ByteString)]
95+
reinjectSequencesIfNeeded _ filtered@((SamHeader{},_):_) = return filtered
96+
reinjectSequencesIfNeeded orig filtered =
97+
if needsReinject (fmap fst filtered)
98+
then do
99+
filtered' <- reinjectSequences (fmap fst orig) (fmap fst filtered)
100+
return $ fmap (\s -> (s, toStrictBS $ encodeSamLine s)) filtered'
101+
else return filtered
85102
where
86-
reinjectSequences True f@((s@SamLine{}, _):rs)
87-
| not (any (hasSequence . fst) f) && any (hasSequence . fst) sg
88-
= do
89-
s' <- addSequence s
90-
return ((s', toStrictBS $ encodeSamLine s'):rs)
91-
reinjectSequences _ f = return f
92-
93-
toStrictBS :: BB.Builder -> B.ByteString
94-
toStrictBS = BL.toStrict . BB.toLazyByteString
95-
96-
addSequence s = case find hasSequence (fst <$> sg) of
97-
Just s'@SamLine{} -> do
98-
cigar' <- fixCigar (samCigar s) (B.length $ samSeq s')
99-
return s { samSeq = samSeq s', samQual = samQual s', samCigar = cigar' }
100-
_ -> return s
103+
needsReinject :: [SamLine] -> Bool
104+
needsReinject fs = let (fs1, fs2, fs0) = splitSamlines3 fs
105+
in needsReinject' fs1 || needsReinject' fs2 || needsReinject' fs0
106+
needsReinject' :: [SamLine] -> Bool
107+
needsReinject' [] = False
108+
needsReinject' xs = not (any hasSequence xs)
101109

110+
-- See note above on "Sequence reinjection" about why this function is necessary
111+
reinjectSequences :: [SamLine] -> [SamLine] -> NGLess [SamLine]
112+
reinjectSequences original filtered = case (splitSamlines3 original, splitSamlines3 filtered) of
113+
((o1, o2, os), (f1, f2, fs)) -> do
114+
r1 <- reinjectSequences' o1 f1
115+
r2 <- reinjectSequences' o2 f2
116+
ss <- reinjectSequences' os fs
117+
return (r1 ++ r2 ++ ss)
118+
reinjectSequences' :: [SamLine] -> [SamLine] -> NGLess [SamLine]
119+
reinjectSequences' original f@(s@SamLine{}:rs)
120+
| not (any hasSequence f) = let
121+
fixed = flip map (filter hasSequence original) $ \s' -> do
122+
cigar <- fixCigar (samCigar s) (samLength s')
123+
return (s { samSeq = samSeq s', samQual = samQual s', samCigar = cigar }:rs)
124+
asum' [] = return f
125+
asum' (x:xs) = case x of
126+
Right{} -> x
127+
Left{} -> asum' xs
128+
in asum' fixed
129+
reinjectSequences' _ f = return f
102130
-- See note above on "Sequence reinjection" about why this function is necessary
103131
fixCigar :: B.ByteString -> Int -> NGLess B.ByteString
104132
fixCigar prev n = do

tests/fix_cigarSam/fixCigar.ngl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@ mappedBlock = select(mapped) using |mr|:
77
discard
88

99
mappedCall = select(mapped, keep_if=[{mapped}])
10+
write(mappedCall, ofile='output.sam')

tests/fix_cigarSam/reinject.sam

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
@SQ SN:SAMN04435864.232 LN:3042917
2+
@SQ SN:SAMN04487766.1000003 LN:272134
3+
@SQ SN:SAMEA4471185.1000011 LN:381280
4+
@SQ SN:SAMN03197959.1000103 LN:43533
15
@SQ SN:4-2_gene39770 LN:1547
26
@SQ SN:7-35611_1_gene245044 LN:1547
37
@SQ SN:5-17872_1_gene78296 LN:1547
@@ -41,3 +45,9 @@ ER9 377 7-ene66981 13 0 22H45M = 13 0 * * NM:i:1 MD:Z:16A28 AS:i:40
4145
ER9 329 4-ene6183 1494 0 45M22H = 1494 0 * * NM:i:1 MD:Z:28T16 AS:i:40
4246
ER9 377 5-ene109198 13 0 22H45M = 13 0 * * NM:i:1 MD:Z:16A28 AS:i:40
4347
ER9 2121 g944564.1178 1508 0 28H39M = 1508 0 GCTAAAAAAGCTCCTTTCTATCGTATTGTTGTTGCAGAC DFGHGGIJIIGCEGHHFECHF@DEDC@C>CB=AAC>;:A NM:i:2 MD:Z:26C8T3 AS:i:30 XS:i:0 SA:Z:4-12192_1_gene91203,1504,+,44M23S,0,0;
48+
GW1808313582 117 * 136740 0 * * 136740 0 GGCAGTGAAGAGGAGGCGGGGAGGTCCCCTGGCGCTCCCGCTGCATGTATGATCGCGGGGTGGTCAAGCAGATCTGCAAGATCGCCTTTATACGGACAGTCATGCCGGCCGCAGCAGTCGCTCTTATCGCGCTCACCGTCTGGGACTGAT JJJAJA<<JJJJJJJJJJJJFJJJJJJFJJAAJJFJFJJJJJJJJJJFJJJJJJJJJJJJJJJJJJJJJJJJJFJJJJJJJFJFJJ<JJFAJJJAFJJJJFJFF<JJJJAJJJJJJFJJJJJJJJJJJJJJFFFAA<JJ<JJJFJFFFAA MC:Z:79M AS:i:0 XS:i:0
49+
GW1808313582 189 * 136740 7 * * 136740 0 CGCCCGCGGCTCAGAAAAAGGGACCTGCCACGTGCGCGCGGGCGGCTTTTAGATGAACGTCCACGACTCCGAGCACATG JJJJJJJFJJJJFJJ<AJJJJFF<JF<JAAFFAJJJJJJJJJJJJJJFJJJJJFJJJJFJJFJJJFJJ<<FFJJFFFAA NM:i:3 MD:Z:6A2A0G68 AS:i:68 XS:i:64
50+
GW1808313582 393 SAMN04435864.232 80639 0 64M15H = 80639 0 * * NM:i:0 MD:Z:64 AS:i:64
51+
GW1808313582 393 SAMN04487766.1000003 230499 0 60M19H = 230499 0 * * NM:i:0 MD:Z:60 AS:i:60
52+
GW1808313582 393 SAMEA4471185.1000011 18883 0 61M18H = 18883 0 * * NM:i:1 MD:Z:54A6 AS:i:56
53+
GW1808313582 393 SAMN03197959.1000103 16191 0 60M19H = 16191 0 * * NM:i:1 MD:Z:36C23 AS:i:55

0 commit comments

Comments
 (0)