Skip to content

Commit 833a1df

Browse files
update Cegex to using matched operator for capturing instead of group operator
1 parent 82156ea commit 833a1df

File tree

3 files changed

+68
-15
lines changed

3 files changed

+68
-15
lines changed

Validator/Learning/Cegex/Cegex.lean

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,26 @@
22
inductive Cegex where
33
| emptyset
44
| epsilon
5+
| matched (c: Char)
56
| char (c: Char)
67
| or (y z: Cegex)
78
| concat (y z: Cegex)
89
| star (y: Cegex)
910
-- group is the only new operator compared to a Cegex without capturing groups.
10-
| group (id: Nat) (capture: List Char) (x: Cegex)
11+
| group (id: Nat) (x: Cegex)
1112
deriving DecidableEq, Ord, Repr, Hashable
1213

1314
def Cegex.nullable (x: Cegex): Bool :=
1415
match x with
1516
| Cegex.emptyset => false
1617
| Cegex.epsilon => true
18+
| Cegex.matched _ => true
1719
| Cegex.char _ => false
1820
| Cegex.or y z => nullable y || nullable z
1921
| Cegex.concat y z => nullable y && nullable z
2022
| Cegex.star _ => true
2123
-- The group is nullable if its embedded expression is nullable.
22-
| Cegex.group _ _ y => nullable y
24+
| Cegex.group _ y => nullable y
2325

2426
def Cegex.unescapable (x: Cegex): Bool :=
2527
match x with

Validator/Learning/Cegex/CegexCapture.lean

Lines changed: 50 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
-- This algorithm is from Regular Expression Sub-Matching using Partial Derivatives - Martin Sulzmann and Kenny Zhuo Ming Lu
1+
-- Based on algorithm from Regular Expression Sub-Matching using Partial Derivatives - Martin Sulzmann and Kenny Zhuo Ming Lu
2+
-- Thank you Keegan Perry for simplifying and explaining the original to me.
23

3-
-- Thank you Keegan Perry for simplifying and explaining this to me.
44
import Validator.Learning.Cegex.Cegex
55

66
namespace CegexCapture
@@ -12,19 +12,21 @@ def neutralize (x: Cegex): Cegex :=
1212
match x with
1313
| Cegex.emptyset => Cegex.emptyset
1414
| Cegex.epsilon => Cegex.epsilon
15+
| Cegex.matched c => Cegex.matched c
1516
| Cegex.char _ => Cegex.emptyset
1617
| Cegex.or y z => Cegex.smartOr (neutralize y) (neutralize z)
1718
| Cegex.concat y z => Cegex.concat (neutralize y) (neutralize z)
1819
| Cegex.star y => Cegex.star (neutralize y)
19-
| Cegex.group id c y => Cegex.group id c (neutralize y)
20+
| Cegex.group id y => Cegex.group id (neutralize y)
2021

2122
partial def derive (x: Cegex) (char: Char): Cegex :=
2223
match x with
2324
| Cegex.emptyset => Cegex.emptyset
2425
| Cegex.epsilon => Cegex.emptyset
26+
| Cegex.matched _ => Cegex.emptyset
2527
| Cegex.char char' =>
2628
if char' = char
27-
then Cegex.epsilon
29+
then Cegex.matched char
2830
else Cegex.emptyset
2931
| Cegex.or y z => Cegex.smartOr (derive y char) (derive z char)
3032
| Cegex.concat y z =>
@@ -38,16 +40,17 @@ partial def derive (x: Cegex) (char: Char): Cegex :=
3840
| Cegex.star y => Cegex.smartConcat (derive y char) x
3941
-- group is the new operator compared to Expr.
4042
-- We store the input tree in the expression.
41-
| Cegex.group n chars y =>
42-
Cegex.group n (char :: chars) (derive y char)
43+
| Cegex.group n y =>
44+
Cegex.group n (derive y char)
4345

44-
def extract (x: Cegex): List (Nat × List Char) :=
46+
def extract (x: Cegex): List Char :=
4547
match x with
4648
-- should never be encountered, since emptyset is not nullable.
4749
| Cegex.emptyset => []
4850
| Cegex.epsilon => []
4951
-- should never be encountered, since char is not nullable.
5052
| Cegex.char _ => []
53+
| Cegex.matched c => [c]
5154
| Cegex.or y z =>
5255
-- Under POSIX semantics, we prefer matching the left alternative.
5356
if y.nullable
@@ -58,12 +61,34 @@ def extract (x: Cegex): List (Nat × List Char) :=
5861
-- Groups under a star are ignored.
5962
-- Recursively extracting under the star causes empty captures to be reported, which we do not want under POSIX semantics.
6063
| Cegex.star _ => []
61-
| Cegex.group id c y => (id, c) :: extract y
64+
-- ignore group, this group will be extracted later by extractGroups.
65+
| Cegex.group _ y => extract y
66+
67+
def extractGroups (x: Cegex): List (Nat × List Char) :=
68+
match x with
69+
-- should never be encountered, since emptyset is not nullable.
70+
| Cegex.emptyset => []
71+
| Cegex.epsilon => []
72+
-- should never be encountered, since char is not nullable.
73+
| Cegex.char _ => []
74+
| Cegex.matched _ => []
75+
| Cegex.or y z =>
76+
-- Under POSIX semantics, we prefer matching the left alternative.
77+
if y.nullable
78+
then extractGroups y
79+
else extractGroups z
80+
| Cegex.concat y z =>
81+
extractGroups y ++ extractGroups z
82+
-- Groups under a star are ignored.
83+
-- Recursively extracting under the star causes empty captures to be reported, which we do not want under POSIX semantics.
84+
| Cegex.star _ => []
85+
-- extract the string
86+
| Cegex.group id y => (id, extract y) :: extractGroups y
6287

6388
def captures (x: Cegex) (chars: List Char): Option (List (Nat × List Char)) :=
6489
let dx := List.foldl derive x chars
6590
if dx.nullable
66-
then Option.some (extract dx)
91+
then Option.some (extractGroups dx)
6792
else Option.none
6893

6994
def capture (name: Nat) (x: Cegex) (input: String): Option String :=
@@ -82,23 +107,23 @@ def capture (name: Nat) (x: Cegex) (input: String): Option String :=
82107

83108
#guard capture 1 (Cegex.concat (Cegex.concat
84109
(Cegex.star (Cegex.char 'a'))
85-
(Cegex.group 1 [] (Cegex.char 'b')))
110+
(Cegex.group 1 (Cegex.char 'b')))
86111
(Cegex.star (Cegex.char 'a'))
87112
)
88113
"aaabaaa" =
89114
Option.some "b"
90115

91116
#guard capture 1 (Cegex.concat (Cegex.concat
92117
(Cegex.star (Cegex.char 'a'))
93-
(Cegex.group 1 [] (Cegex.star (Cegex.char 'b'))))
118+
(Cegex.group 1 (Cegex.star (Cegex.char 'b'))))
94119
(Cegex.star (Cegex.char 'a'))
95120
)
96121
"aaabbbaaa" =
97122
Option.some "bbb"
98123

99124
#guard capture 1 (Cegex.concat (Cegex.concat
100125
(Cegex.star (Cegex.char 'a'))
101-
(Cegex.group 1 []
126+
(Cegex.group 1
102127
(Cegex.or
103128
(Cegex.star (Cegex.char 'b'))
104129
(Cegex.star (Cegex.char 'c'))
@@ -108,3 +133,16 @@ def capture (name: Nat) (x: Cegex) (input: String): Option String :=
108133
)
109134
"aaacccaaa" =
110135
Option.some "ccc"
136+
137+
#guard capture 1 (Cegex.concat (Cegex.concat
138+
(Cegex.star (Cegex.char 'a'))
139+
(Cegex.group 1
140+
(Cegex.or
141+
(Cegex.star (Cegex.char 'b'))
142+
(Cegex.concat (Cegex.char 'b') (Cegex.star (Cegex.char 'c')))
143+
)
144+
))
145+
(Cegex.star (Cegex.char 'a'))
146+
)
147+
"aaabccaaa" =
148+
Option.some "bcc"

Validator/Learning/Regex/RegexCapture.lean

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ partial def derive (x: Regex) (char: Char): Regex :=
3939
-- group is the new operator compared to Expr.
4040
-- We store the input tree in the expression.
4141
| Regex.group n chars y =>
42-
Regex.group n (char :: chars) (derive y char)
42+
Regex.group n (chars ++ [char]) (derive y char)
4343

4444
def extract (x: Regex): List (Nat × List Char) :=
4545
match x with
@@ -108,3 +108,16 @@ def capture (name: Nat) (x: Regex) (input: String): Option String :=
108108
)
109109
"aaacccaaa" =
110110
Option.some "ccc"
111+
112+
#guard capture 1 (Regex.concat (Regex.concat
113+
(Regex.star (Regex.char 'a'))
114+
(Regex.group 1 []
115+
(Regex.or
116+
(Regex.star (Regex.char 'b'))
117+
(Regex.concat (Regex.char 'b') (Regex.star (Regex.char 'c')))
118+
)
119+
))
120+
(Regex.star (Regex.char 'a'))
121+
)
122+
"aaabccaaa" =
123+
Option.some "bcc"

0 commit comments

Comments
 (0)