Skip to content

Commit 8d2765b

Browse files
authored
Merge pull request #194 from Bodigrim/master
Support text-2.0
2 parents 7c8a9a2 + e504380 commit 8d2765b

File tree

3 files changed

+82
-17
lines changed

3 files changed

+82
-17
lines changed

Data/Attoparsec/Text/Buffer.hs

Lines changed: 63 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@ module Data.Attoparsec.Text.Buffer
3434
, iter
3535
, iter_
3636
, substring
37-
, dropWord16
37+
, lengthCodeUnits
38+
, dropCodeUnits
3839
) where
3940

4041
import Control.Exception (assert)
@@ -44,8 +45,14 @@ import Data.Monoid as Mon (Monoid(..))
4445
import Data.Semigroup (Semigroup(..))
4546
import Data.Text ()
4647
import Data.Text.Internal (Text(..))
48+
#if MIN_VERSION_text(2,0,0)
49+
import Data.Text.Internal.Encoding.Utf8 (utf8LengthByLeader)
50+
import Data.Text.Unsafe (iterArray, lengthWord8)
51+
#else
4752
import Data.Text.Internal.Encoding.Utf16 (chr2)
4853
import Data.Text.Internal.Unsafe.Char (unsafeChr)
54+
import Data.Text.Unsafe (lengthWord16)
55+
#endif
4956
import Data.Text.Unsafe (Iter(..))
5057
import Foreign.Storable (sizeOf)
5158
import GHC.Exts (Int(..), indexIntArray#, unsafeCoerce#, writeIntArray#)
@@ -108,16 +115,25 @@ append (Buf arr0 off0 len0 cap0 gen0) !arr1 !off1 !len1 = runST $ do
108115
let newgen = gen + 1
109116
marr <- unsafeThaw arr0
110117
writeGen marr newgen
118+
#if MIN_VERSION_text(2,0,0)
119+
A.copyI newlen marr (off0+len0) arr1 off1
120+
#else
111121
A.copyI marr (off0+len0) arr1 off1 (off0+newlen)
122+
#endif
112123
arr2 <- A.unsafeFreeze marr
113124
return (Buf arr2 off0 newlen cap0 newgen)
114125
else do
115126
let newcap = newlen * 2
116127
newgen = 1
117128
marr <- A.new (newcap + woff)
118129
writeGen marr newgen
130+
#if MIN_VERSION_text(2,0,0)
131+
A.copyI len0 marr woff arr0 off0
132+
A.copyI newlen marr (woff+len0) arr1 off1
133+
#else
119134
A.copyI marr woff arr0 off0 (woff+len0)
120135
A.copyI marr (woff+len0) arr1 off1 (woff+newlen)
136+
#endif
121137
arr2 <- A.unsafeFreeze marr
122138
return (Buf arr2 woff newlen newcap newgen)
123139

@@ -132,11 +148,52 @@ substring s l (Buf arr off len _ _) =
132148
Text arr (off+s) l
133149
{-# INLINE substring #-}
134150

135-
dropWord16 :: Int -> Buffer -> Text
136-
dropWord16 s (Buf arr off len _ _) =
151+
#if MIN_VERSION_text(2,0,0)
152+
153+
lengthCodeUnits :: Text -> Int
154+
lengthCodeUnits = lengthWord8
155+
156+
dropCodeUnits :: Int -> Buffer -> Text
157+
dropCodeUnits s (Buf arr off len _ _) =
158+
assert (s >= 0 && s <= len) $
159+
Text arr (off+s) (len-s)
160+
{-# INLINE dropCodeUnits #-}
161+
162+
-- | /O(1)/ Iterate (unsafely) one step forwards through a UTF-8
163+
-- array, returning the current character and the delta to add to give
164+
-- the next offset to iterate at.
165+
iter :: Buffer -> Int -> Iter
166+
iter (Buf arr off _ _ _) i = iterArray arr (off + i)
167+
{-# INLINE iter #-}
168+
169+
-- | /O(1)/ Iterate one step through a UTF-8 array, returning the
170+
-- delta to add to give the next offset to iterate at.
171+
iter_ :: Buffer -> Int -> Int
172+
iter_ (Buf arr off _ _ _) i = utf8LengthByLeader $ A.unsafeIndex arr (off+i)
173+
{-# INLINE iter_ #-}
174+
175+
unsafeThaw :: A.Array -> ST s (A.MArray s)
176+
unsafeThaw (A.ByteArray a) = ST $ \s# ->
177+
(# s#, A.MutableByteArray (unsafeCoerce# a) #)
178+
179+
readGen :: A.Array -> Int
180+
readGen (A.ByteArray a) = case indexIntArray# a 0# of r# -> I# r#
181+
182+
writeGen :: A.MArray s -> Int -> ST s ()
183+
writeGen (A.MutableByteArray a) (I# gen#) = ST $ \s0# ->
184+
case writeIntArray# a 0# gen# s0# of
185+
s1# -> (# s1#, () #)
186+
187+
#else
188+
189+
lengthCodeUnits :: Text -> Int
190+
lengthCodeUnits = lengthWord16
191+
192+
dropCodeUnits :: Int -> Buffer -> Text
193+
dropCodeUnits s (Buf arr off len _ _) =
137194
assert (s >= 0 && s <= len) $
138195
Text arr (off+s) (len-s)
139-
{-# INLINE dropWord16 #-}
196+
{-# INLINE dropCodeUnits #-}
140197

141198
-- | /O(1)/ Iterate (unsafely) one step forwards through a UTF-16
142199
-- array, returning the current character and the delta to add to give
@@ -170,3 +227,5 @@ writeGen :: A.MArray s -> Int -> ST s ()
170227
writeGen a (I# gen#) = ST $ \s0# ->
171228
case writeIntArray# (A.maBA a) 0# gen# s0# of
172229
s1# -> (# s1#, () #)
230+
231+
#endif

Data/Attoparsec/Text/Internal.hs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ string_ suspended f s0 = T.Parser $ \t pos more lose succ ->
176176
| T.null ft -> suspended s s t pos more lose succ
177177
| otherwise -> lose t pos more [] "string"
178178
Just (pfx,ssfx,tsfx)
179-
| T.null ssfx -> let l = Pos (T.lengthWord16 pfx)
179+
| T.null ssfx -> let l = Pos (Buf.lengthCodeUnits pfx)
180180
in succ t (pos + l) more (substring pos l t)
181181
| not (T.null tsfx) -> lose t pos more [] "string"
182182
| otherwise -> suspended s ssfx t pos more lose succ
@@ -195,7 +195,7 @@ stringSuspended f s000 s0 t0 pos0 more0 lose0 succ0 =
195195
in case T.commonPrefixes s0 s of
196196
Nothing -> lose t pos more [] "string"
197197
Just (_pfx,ssfx,tsfx)
198-
| T.null ssfx -> let l = Pos (T.lengthWord16 s000)
198+
| T.null ssfx -> let l = Pos (Buf.lengthCodeUnits s000)
199199
in succ t (pos + l) more (substring pos l t)
200200
| T.null tsfx -> stringSuspended f s000 ssfx t pos more lose succ
201201
| otherwise -> lose t pos more [] "string"
@@ -445,12 +445,12 @@ endOfLine = (char '\n' >> return ()) <|> (string "\r\n" >> return ())
445445

446446
-- | Terminal failure continuation.
447447
failK :: Failure a
448-
failK t (Pos pos) _more stack msg = Fail (Buf.dropWord16 pos t) stack msg
448+
failK t (Pos pos) _more stack msg = Fail (Buf.dropCodeUnits pos t) stack msg
449449
{-# INLINE failK #-}
450450

451451
-- | Terminal success continuation.
452452
successK :: Success a a
453-
successK t (Pos pos) _more a = Done (Buf.dropWord16 pos t) a
453+
successK t (Pos pos) _more a = Done (Buf.dropCodeUnits pos t) a
454454
{-# INLINE successK #-}
455455

456456
-- | Run a parser.
@@ -477,7 +477,7 @@ parseOnly m s = case runParser m (buffer s) 0 Complete failK successK of
477477

478478
get :: Parser Text
479479
get = T.Parser $ \t pos more _lose succ ->
480-
succ t pos more (Buf.dropWord16 (fromPos pos) t)
480+
succ t pos more (Buf.dropCodeUnits (fromPos pos) t)
481481
{-# INLINE get #-}
482482

483483
endOfChunk :: Parser Bool

tests/QC/Buffer.hs

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ b_length :: BPB -> Property
5151
b_length (BP _ts t buf) = B.length t === BB.length buf
5252

5353
t_length :: BPT -> Property
54-
t_length (BP _ts t buf) = T.lengthWord16 t === BT.length buf
54+
t_length (BP _ts t buf) = BT.lengthCodeUnits t === BT.length buf
5555

5656
b_unsafeIndex :: BPB -> Gen Property
5757
b_unsafeIndex (BP _ts t buf) = do
@@ -61,14 +61,14 @@ b_unsafeIndex (BP _ts t buf) = do
6161

6262
t_iter :: BPT -> Gen Property
6363
t_iter (BP _ts t buf) = do
64-
let l = T.lengthWord16 t
64+
let l = BT.lengthCodeUnits t
6565
i <- choose (0,l-1)
6666
let it (T.Iter c q) = (c,q)
6767
return $ l === 0 .||. it (T.iter t i) === it (BT.iter buf i)
6868

6969
t_iter_ :: BPT -> Gen Property
7070
t_iter_ (BP _ts t buf) = do
71-
let l = T.lengthWord16 t
71+
let l = BT.lengthCodeUnits t
7272
i <- choose (0,l-1)
7373
return $ l === 0 .||. T.iter_ t i === BT.iter_ buf i
7474

@@ -77,10 +77,16 @@ b_unsafeDrop (BP _ts t buf) = do
7777
i <- choose (0, B.length t)
7878
return $ B.unsafeDrop i t === BB.unsafeDrop i buf
7979

80-
t_dropWord16 :: BPT -> Gen Property
81-
t_dropWord16 (BP _ts t buf) = do
82-
i <- choose (0, T.lengthWord16 t)
83-
return $ T.dropWord16 i t === BT.dropWord16 i buf
80+
t_dropCodeUnits :: BPT -> Gen Property
81+
t_dropCodeUnits (BP _ts t buf) = do
82+
i <- choose (0, BT.lengthCodeUnits t)
83+
return $ dropCodeUnits i t === BT.dropCodeUnits i buf
84+
where
85+
#if MIN_VERSION_text(2,0,0)
86+
dropCodeUnits = T.dropWord8
87+
#else
88+
dropCodeUnits = T.dropWord16
89+
#endif
8490

8591
tests :: [TestTree]
8692
tests = [
@@ -92,5 +98,5 @@ tests = [
9298
, testProperty "t_iter" t_iter
9399
, testProperty "t_iter_" t_iter_
94100
, testProperty "b_unsafeDrop" b_unsafeDrop
95-
, testProperty "t_dropWord16" t_dropWord16
101+
, testProperty "t_dropCodeUnits" t_dropCodeUnits
96102
]

0 commit comments

Comments
 (0)