From 813989c02c714a7acb9990c2dec783e88f6df48e Mon Sep 17 00:00:00 2001 From: Anton-Latukha Date: Sat, 25 Sep 2021 11:08:36 +0300 Subject: [PATCH 1/7] Text: add fun isSubsequenceOf Function to detect ordered subsequences in the content. --- src/Data/Text.hs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index b1f3a189..262bf821 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -167,6 +167,7 @@ module Data.Text , isPrefixOf , isSuffixOf , isInfixOf + , isSubsequenceOf -- ** View patterns , stripPrefix @@ -1881,6 +1882,22 @@ isInfixOf needle haystack | otherwise = not . L.null . indices needle $ haystack {-# INLINE [1] isInfixOf #-} +-- | The 'isSubsequenceOf' function takes two 'Text's and returns +-- 'True' iff the first is a subsequence of a second. +-- (characters of the first argument appear in same sequential order in +-- the second, to say if first argument that can be derived by deleting some +-- or no elements from the second). +isSubsequenceOf :: Text -> Text -> Bool +isSubsequenceOf sf tf + | length sf > length tf = False + | otherwise = subseqOf sf tf + where + subseqOf s t + | null s = True + | null t = False + | unsafeHead s == unsafeHead t = subseqOf (unsafeTail s) (unsafeTail t) + | otherwise = subseqOf s $ unsafeTail t + ------------------------------------------------------------------------------- -- * View patterns From c53b59f4d40a8a8e531cd0652037421ed7b57e32 Mon Sep 17 00:00:00 2001 From: Anton-Latukha Date: Sat, 25 Sep 2021 13:53:15 +0300 Subject: [PATCH 2/7] Text: flip isSubsequenceOf Previous argument order was alike `lookup` argument order mistake, when key cached (compiler saturates function) across DBs. Efficient use of the function is to all it once (per pattern) on the whole contents. So the contents should be cached first, and the search patterns are switchable. --- src/Data/Text.hs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index 262bf821..2136f0dc 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -1883,20 +1883,20 @@ isInfixOf needle haystack {-# INLINE [1] isInfixOf #-} -- | The 'isSubsequenceOf' function takes two 'Text's and returns --- 'True' iff the first is a subsequence of a second. --- (characters of the first argument appear in same sequential order in --- the second, to say if first argument that can be derived by deleting some --- or no elements from the second). +-- 'True' iff the second is a subsequence of the first. +-- (characters of the second argument appear in same sequential order in +-- the first, to say if second argument can be derived by deleting some +-- or no elements from the first). isSubsequenceOf :: Text -> Text -> Bool -isSubsequenceOf sf tf +isSubsequenceOf tf sf | length sf > length tf = False - | otherwise = subseqOf sf tf + | otherwise = subseqOf tf sf where - subseqOf s t + subseqOf t s | null s = True | null t = False - | unsafeHead s == unsafeHead t = subseqOf (unsafeTail s) (unsafeTail t) - | otherwise = subseqOf s $ unsafeTail t + | unsafeHead s == unsafeHead t = subseqOf (unsafeTail t) (unsafeTail s) + | otherwise = subseqOf (unsafeTail t) s ------------------------------------------------------------------------------- -- * View patterns From 173c1ae51b4a9cb77a67f7baac6350bb0fb626fe Mon Sep 17 00:00:00 2001 From: Anton-Latukha Date: Tue, 28 Sep 2021 12:34:35 +0300 Subject: [PATCH 3/7] WIP: Text: isSubsequenceOf: ({null, unsafeHead, unsafeTail} -> uncons) As suggested, indeed `uncons` replaces 3 funciton invocations with 1. --- src/Data/Text.hs | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index 2136f0dc..f5601aa5 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -254,6 +254,8 @@ import qualified Language.Haskell.TH.Lib as TH import qualified Language.Haskell.TH.Syntax as TH import Text.Printf (PrintfArg, formatArg, formatString) import System.Posix.Types (CSsize(..)) +import Data.Maybe (maybe) +import Data.Bool (bool) -- $setup -- >>> import Data.Text @@ -1892,11 +1894,23 @@ isSubsequenceOf tf sf | length sf > length tf = False | otherwise = subseqOf tf sf where - subseqOf t s - | null s = True - | null t = False - | unsafeHead s == unsafeHead t = subseqOf (unsafeTail t) (unsafeTail s) - | otherwise = subseqOf (unsafeTail t) s + subseqOf :: Text -> Text -> Bool + subseqOf t s = + maybe + True + (\ (sc,ss) -> + maybe + False + (\ (tc,ts) -> + subseqOf ts $ + bool + s + ss + (sc /= tc) + ) + (uncons t) + ) + (uncons s) ------------------------------------------------------------------------------- -- * View patterns From f477ba6b63ae6c6a3ccdb6d5dbe2907609fc34b8 Mon Sep 17 00:00:00 2001 From: Anton-Latukha Date: Tue, 28 Sep 2021 12:51:45 +0300 Subject: [PATCH 4/7] WIP: Text: isSubsequenceOf: lambda to imperative `on` now ensures that `uncons` can be specialized/consumed once. It would be interesting to look into Core at some point of work. --- src/Data/Text.hs | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index f5601aa5..e6021773 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -254,8 +254,7 @@ import qualified Language.Haskell.TH.Lib as TH import qualified Language.Haskell.TH.Syntax as TH import Text.Printf (PrintfArg, formatArg, formatString) import System.Posix.Types (CSsize(..)) -import Data.Maybe (maybe) -import Data.Bool (bool) +import Data.Function (on) -- $setup -- >>> import Data.Text @@ -1896,21 +1895,16 @@ isSubsequenceOf tf sf where subseqOf :: Text -> Text -> Bool subseqOf t s = - maybe - True - (\ (sc,ss) -> - maybe - False - (\ (tc,ts) -> - subseqOf ts $ - bool - s - ss - (sc /= tc) - ) - (uncons t) - ) - (uncons s) + on f uncons s t + where + f :: Maybe (Char, Text) -> Maybe (Char, Text) -> Bool + f Nothing _ = True + f _ Nothing = False + f (Just (sc,ss)) (Just (tc,ts)) = + subseqOf ts $ + if sc == tc + then s + else ss ------------------------------------------------------------------------------- -- * View patterns From 2a8e3d996421397f8d02444bb7f58effd4327e43 Mon Sep 17 00:00:00 2001 From: Anton-Latukha Date: Tue, 28 Sep 2021 12:59:07 +0300 Subject: [PATCH 5/7] WIP: Text: isSubsequenceOf: flip local fun This way may have more stack reuse. --- src/Data/Text.hs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index e6021773..3836d7d8 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -1895,14 +1895,14 @@ isSubsequenceOf tf sf where subseqOf :: Text -> Text -> Bool subseqOf t s = - on f uncons s t + on f uncons t s where f :: Maybe (Char, Text) -> Maybe (Char, Text) -> Bool - f Nothing _ = True - f _ Nothing = False - f (Just (sc,ss)) (Just (tc,ts)) = + f _ Nothing = True + f Nothing _ = False + f (Just (tc,ts)) (Just (sc,ss)) = subseqOf ts $ - if sc == tc + if tc == sc then s else ss From 865ada406a76490346eb23012f9f43d0b31abc83 Mon Sep 17 00:00:00 2001 From: Anton-Latukha Date: Wed, 29 Sep 2021 16:42:34 +0300 Subject: [PATCH 6/7] WIP: Text: isSubsequenceOf: upd doc (add examples of code & use) --- src/Data/Text.hs | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/src/Data/Text.hs b/src/Data/Text.hs index 3836d7d8..3b5fdd7e 100644 --- a/src/Data/Text.hs +++ b/src/Data/Text.hs @@ -1883,11 +1883,25 @@ isInfixOf needle haystack | otherwise = not . L.null . indices needle $ haystack {-# INLINE [1] isInfixOf #-} --- | The 'isSubsequenceOf' function takes two 'Text's and returns --- 'True' iff the second is a subsequence of the first. --- (characters of the second argument appear in same sequential order in --- the first, to say if second argument can be derived by deleting some --- or no elements from the first). +-- 2021-09-29: NOTE: +-- * after the implementation - determine & mention the big O +-- | The 'isSubsequenceOf' function takes the main text and the subsequnce +-- to find and returns 'True' iff the second argument is a subsequence +-- of the first. +-- +-- "Subsequence" used in the meaning of: characters of the second argument +-- appear in same sequential order in the main data, to say second argument can +-- be derived by deleting some (any) or no elements from the first. +-- +-- Examples: +-- +-- >>> isSubsequenceOf "1234567" "1356" +-- True +-- +-- >>> isSubsequenceOf "1234567" "21" +-- False +-- +-- `isSubsequenceOf` is the base case & implementation of fuzzy search. isSubsequenceOf :: Text -> Text -> Bool isSubsequenceOf tf sf | length sf > length tf = False From 073d676470d62ef685e549970f2dbe69a387e0ea Mon Sep 17 00:00:00 2001 From: Anton-Latukha Date: Sun, 3 Oct 2021 19:48:48 +0300 Subject: [PATCH 7/7] WIP: Tests.Properties.Substrings: add genOrdSubseq --- tests/Tests/Properties/Substrings.hs | 43 ++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tests/Tests/Properties/Substrings.hs b/tests/Tests/Properties/Substrings.hs index 46fa1dca..f3b5b8f4 100644 --- a/tests/Tests/Properties/Substrings.hs +++ b/tests/Tests/Properties/Substrings.hs @@ -20,6 +20,8 @@ import qualified Data.Text.Internal.Lazy as TL (Text(..)) import qualified Data.Text.Internal.Lazy.Fusion as SL import qualified Data.Text.Lazy as TL import qualified Tests.SlowFunctions as Slow +import Control.Monad (replicateM) +import Data.List (nub, sort) s_take n = L.take n `eqP` (unpackS . S.take n) s_take_s (Small n) = L.take n `eqP` (unpackS . S.unstream . S.take n) @@ -231,6 +233,47 @@ tl_isSuffixOf s = L.isSuffixOf s`eqP` TL.isSuffixOf (packS s) t_isInfixOf s = L.isInfixOf s `eqP` T.isInfixOf (packS s) tl_isInfixOf s = L.isInfixOf s `eqP` TL.isInfixOf (packS s) +-- | Generator for substrings that keeps the element order. +-- Aka: "1234567890" -> "245680" +genOrdSubseq :: T.Text -> Gen T.Text +genOrdSubseq txt = + T.pack . transform <$> genTransformMap + where + + pickN :: Gen Int + pickN = + choose (0, T.length txt) + + pickNs :: Gen [Int] + pickNs = + fmap (sort . nub) $ (`replicateM` pickN) =<< pickN + + growInst :: [Bool] -> Int -> [Bool] + growInst ls n = + ls + <> take (length ls - pred n) [True ..] + <> [False] + + mkTransformInst :: [Bool] -> [Int] -> [Bool] + mkTransformInst bls [] = + bls + <> take (T.length txt - length bls) [True ..] + mkTransformInst bls (i:is) = + mkTransformInst + (growInst bls i) + is + + mkTransformMap :: [a] -> [Int] -> [(a, Bool)] + mkTransformMap ls ixs = + zip ls (mkTransformInst mempty ixs) + + genTransformMap :: (Gen [(Char, Bool)]) + genTransformMap = fmap (mkTransformMap $ T.unpack txt) pickNs + + transform :: [(Char, Bool)] -> [Char] + transform = + foldr (\ (c, b) as -> as <> if b then [c] else mempty) mempty + t_stripPrefix s = (fmap packS . L.stripPrefix s) `eqP` T.stripPrefix (packS s) tl_stripPrefix s = (fmap packS . L.stripPrefix s) `eqP` TL.stripPrefix (packS s)