1- -- This algorithm is from Regular Expression Sub-Matching using Partial Derivatives - Martin Sulzmann and Kenny Zhuo Ming Lu
1+ -- Based on algorithm from Regular Expression Sub-Matching using Partial Derivatives - Martin Sulzmann and Kenny Zhuo Ming Lu
2+ -- Thank you Keegan Perry for simplifying and explaining the original to me.
23
3- -- Thank you Keegan Perry for simplifying and explaining this to me.
44import Validator.Learning.Cegex.Cegex
55
66namespace CegexCapture
@@ -12,19 +12,21 @@ def neutralize (x: Cegex): Cegex :=
1212 match x with
1313 | Cegex.emptyset => Cegex.emptyset
1414 | Cegex.epsilon => Cegex.epsilon
15+ | Cegex.matched c => Cegex.matched c
1516 | Cegex.char _ => Cegex.emptyset
1617 | Cegex.or y z => Cegex.smartOr (neutralize y) (neutralize z)
1718 | Cegex.concat y z => Cegex.concat (neutralize y) (neutralize z)
1819 | Cegex.star y => Cegex.star (neutralize y)
19- | Cegex.group id c y => Cegex.group id c (neutralize y)
20+ | Cegex.group id y => Cegex.group id (neutralize y)
2021
2122partial def derive (x: Cegex) (char: Char): Cegex :=
2223 match x with
2324 | Cegex.emptyset => Cegex.emptyset
2425 | Cegex.epsilon => Cegex.emptyset
26+ | Cegex.matched _ => Cegex.emptyset
2527 | Cegex.char char' =>
2628 if char' = char
27- then Cegex.epsilon
29+ then Cegex.matched char
2830 else Cegex.emptyset
2931 | Cegex.or y z => Cegex.smartOr (derive y char) (derive z char)
3032 | Cegex.concat y z =>
@@ -38,16 +40,17 @@ partial def derive (x: Cegex) (char: Char): Cegex :=
3840 | Cegex.star y => Cegex.smartConcat (derive y char) x
3941 -- group is the new operator compared to Expr.
4042 -- We store the input tree in the expression.
41- | Cegex.group n chars y =>
42- Cegex.group n (char :: chars) ( derive y char)
43+ | Cegex.group n y =>
44+ Cegex.group n (derive y char)
4345
44- def extract (x: Cegex): List (Nat × List Char) :=
46+ def extract (x: Cegex): List Char :=
4547 match x with
4648 -- should never be encountered, since emptyset is not nullable.
4749 | Cegex.emptyset => []
4850 | Cegex.epsilon => []
4951 -- should never be encountered, since char is not nullable.
5052 | Cegex.char _ => []
53+ | Cegex.matched c => [c]
5154 | Cegex.or y z =>
5255 -- Under POSIX semantics, we prefer matching the left alternative.
5356 if y.nullable
@@ -58,12 +61,34 @@ def extract (x: Cegex): List (Nat × List Char) :=
5861 -- Groups under a star are ignored.
5962 -- Recursively extracting under the star causes empty captures to be reported, which we do not want under POSIX semantics.
6063 | Cegex.star _ => []
61- | Cegex.group id c y => (id, c) :: extract y
64+ -- ignore group, this group will be extracted later by extractGroups.
65+ | Cegex.group _ y => extract y
66+
67+ def extractGroups (x: Cegex): List (Nat × List Char) :=
68+ match x with
69+ -- should never be encountered, since emptyset is not nullable.
70+ | Cegex.emptyset => []
71+ | Cegex.epsilon => []
72+ -- should never be encountered, since char is not nullable.
73+ | Cegex.char _ => []
74+ | Cegex.matched _ => []
75+ | Cegex.or y z =>
76+ -- Under POSIX semantics, we prefer matching the left alternative.
77+ if y.nullable
78+ then extractGroups y
79+ else extractGroups z
80+ | Cegex.concat y z =>
81+ extractGroups y ++ extractGroups z
82+ -- Groups under a star are ignored.
83+ -- Recursively extracting under the star causes empty captures to be reported, which we do not want under POSIX semantics.
84+ | Cegex.star _ => []
85+ -- extract the string
86+ | Cegex.group id y => (id, extract y) :: extractGroups y
6287
6388def captures (x: Cegex) (chars: List Char): Option (List (Nat × List Char)) :=
6489 let dx := List.foldl derive x chars
6590 if dx.nullable
66- then Option.some (extract dx)
91+ then Option.some (extractGroups dx)
6792 else Option.none
6893
6994def capture (name: Nat) (x: Cegex) (input: String): Option String :=
@@ -82,23 +107,23 @@ def capture (name: Nat) (x: Cegex) (input: String): Option String :=
82107
83108#guard capture 1 (Cegex.concat (Cegex.concat
84109 (Cegex.star (Cegex.char 'a' ))
85- (Cegex.group 1 [] (Cegex.char 'b' )))
110+ (Cegex.group 1 (Cegex.char 'b' )))
86111 (Cegex.star (Cegex.char 'a' ))
87112 )
88113 "aaabaaa" =
89114 Option.some "b"
90115
91116#guard capture 1 (Cegex.concat (Cegex.concat
92117 (Cegex.star (Cegex.char 'a' ))
93- (Cegex.group 1 [] (Cegex.star (Cegex.char 'b' ))))
118+ (Cegex.group 1 (Cegex.star (Cegex.char 'b' ))))
94119 (Cegex.star (Cegex.char 'a' ))
95120 )
96121 "aaabbbaaa" =
97122 Option.some "bbb"
98123
99124#guard capture 1 (Cegex.concat (Cegex.concat
100125 (Cegex.star (Cegex.char 'a' ))
101- (Cegex.group 1 []
126+ (Cegex.group 1
102127 (Cegex.or
103128 (Cegex.star (Cegex.char 'b' ))
104129 (Cegex.star (Cegex.char 'c' ))
@@ -108,3 +133,16 @@ def capture (name: Nat) (x: Cegex) (input: String): Option String :=
108133 )
109134 "aaacccaaa" =
110135 Option.some "ccc"
136+
137+ #guard capture 1 (Cegex.concat (Cegex.concat
138+ (Cegex.star (Cegex.char 'a' ))
139+ (Cegex.group 1
140+ (Cegex.or
141+ (Cegex.star (Cegex.char 'b' ))
142+ (Cegex.concat (Cegex.char 'b' ) (Cegex.star (Cegex.char 'c' )))
143+ )
144+ ))
145+ (Cegex.star (Cegex.char 'a' ))
146+ )
147+ "aaabccaaa" =
148+ Option.some "bcc"
0 commit comments