Skip to content

Commit 82156ea

Browse files
add copy for Regex implementation for experimentation
1 parent 361cb68 commit 82156ea

File tree

4 files changed

+174
-6
lines changed

4 files changed

+174
-6
lines changed

Validator/Learning.lean

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,6 @@ import Validator.Learning.Parser
99

1010
import Validator.Learning.Regex.Regex
1111
import Validator.Learning.Regex.RegexCapture
12+
13+
import Validator.Learning.Cegex.Cegex
14+
import Validator.Learning.Cegex.CegexCapture
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
-- Cegex includes capturing groups
2+
inductive Cegex where
3+
| emptyset
4+
| epsilon
5+
| char (c: Char)
6+
| or (y z: Cegex)
7+
| concat (y z: Cegex)
8+
| star (y: Cegex)
9+
-- group is the only new operator compared to a Cegex without capturing groups.
10+
| group (id: Nat) (capture: List Char) (x: Cegex)
11+
deriving DecidableEq, Ord, Repr, Hashable
12+
13+
def Cegex.nullable (x: Cegex): Bool :=
14+
match x with
15+
| Cegex.emptyset => false
16+
| Cegex.epsilon => true
17+
| Cegex.char _ => false
18+
| Cegex.or y z => nullable y || nullable z
19+
| Cegex.concat y z => nullable y && nullable z
20+
| Cegex.star _ => true
21+
-- The group is nullable if its embedded expression is nullable.
22+
| Cegex.group _ _ y => nullable y
23+
24+
def Cegex.unescapable (x: Cegex): Bool :=
25+
match x with
26+
| Cegex.emptyset => true
27+
| _ => false
28+
29+
-- smartOr is a smart constructor for the or operator.
30+
def Cegex.smartOr (x y: Cegex): Cegex :=
31+
match x with
32+
| Cegex.emptyset => y
33+
| _ =>
34+
match y with
35+
| Cegex.emptyset => x
36+
| _ => Cegex.or x y
37+
38+
-- smartConcat is a smart constructor for the concat operator.
39+
def Cegex.smartConcat (x y: Cegex): Cegex :=
40+
match x with
41+
| Cegex.emptyset => Cegex.emptyset
42+
| _ =>
43+
match y with
44+
| Cegex.emptyset => Cegex.emptyset
45+
| _ => Cegex.concat x y
46+
47+
-- smartStar is a smart constructor for the star operator.
48+
def Cegex.smartStar (x: Cegex): Cegex :=
49+
match x with
50+
| Cegex.star _ => x
51+
| _ => Cegex.star x
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
-- This algorithm is from Regular Expression Sub-Matching using Partial Derivatives - Martin Sulzmann and Kenny Zhuo Ming Lu
2+
3+
-- Thank you Keegan Perry for simplifying and explaining this to me.
4+
import Validator.Learning.Cegex.Cegex
5+
6+
namespace CegexCapture
7+
8+
-- neutralize replaces all chars with emptyset.
9+
-- This way the expression will stay nullable and not change based on derivative input.
10+
-- This makes it possible to keep all the capture groups inside for later extraction.
11+
def neutralize (x: Cegex): Cegex :=
12+
match x with
13+
| Cegex.emptyset => Cegex.emptyset
14+
| Cegex.epsilon => Cegex.epsilon
15+
| Cegex.char _ => Cegex.emptyset
16+
| Cegex.or y z => Cegex.smartOr (neutralize y) (neutralize z)
17+
| Cegex.concat y z => Cegex.concat (neutralize y) (neutralize z)
18+
| Cegex.star y => Cegex.star (neutralize y)
19+
| Cegex.group id c y => Cegex.group id c (neutralize y)
20+
21+
partial def derive (x: Cegex) (char: Char): Cegex :=
22+
match x with
23+
| Cegex.emptyset => Cegex.emptyset
24+
| Cegex.epsilon => Cegex.emptyset
25+
| Cegex.char char' =>
26+
if char' = char
27+
then Cegex.epsilon
28+
else Cegex.emptyset
29+
| Cegex.or y z => Cegex.smartOr (derive y char) (derive z char)
30+
| Cegex.concat y z =>
31+
if Cegex.nullable y
32+
then Cegex.smartOr
33+
(Cegex.smartConcat (derive y char) z)
34+
-- A difference from the usual derive algorithm:
35+
-- Instead of (derive z tree), we write:
36+
(Cegex.smartConcat (neutralize y) (derive z char))
37+
else Cegex.concat (derive y char) z
38+
| Cegex.star y => Cegex.smartConcat (derive y char) x
39+
-- group is the new operator compared to Expr.
40+
-- We store the input tree in the expression.
41+
| Cegex.group n chars y =>
42+
Cegex.group n (char :: chars) (derive y char)
43+
44+
def extract (x: Cegex): List (Nat × List Char) :=
45+
match x with
46+
-- should never be encountered, since emptyset is not nullable.
47+
| Cegex.emptyset => []
48+
| Cegex.epsilon => []
49+
-- should never be encountered, since char is not nullable.
50+
| Cegex.char _ => []
51+
| Cegex.or y z =>
52+
-- Under POSIX semantics, we prefer matching the left alternative.
53+
if y.nullable
54+
then extract y
55+
else extract z
56+
| Cegex.concat y z =>
57+
extract y ++ extract z
58+
-- Groups under a star are ignored.
59+
-- Recursively extracting under the star causes empty captures to be reported, which we do not want under POSIX semantics.
60+
| Cegex.star _ => []
61+
| Cegex.group id c y => (id, c) :: extract y
62+
63+
def captures (x: Cegex) (chars: List Char): Option (List (Nat × List Char)) :=
64+
let dx := List.foldl derive x chars
65+
if dx.nullable
66+
then Option.some (extract dx)
67+
else Option.none
68+
69+
def capture (name: Nat) (x: Cegex) (input: String): Option String :=
70+
match input with
71+
| String.mk chars =>
72+
match captures x chars with
73+
| Option.none => Option.none
74+
| Option.some cs =>
75+
let strs := List.filterMap
76+
(fun (name', str) =>
77+
if name = name'
78+
then Option.some (String.mk str)
79+
else Option.none
80+
) cs
81+
List.head? (List.reverse (List.mergeSort strs))
82+
83+
#guard capture 1 (Cegex.concat (Cegex.concat
84+
(Cegex.star (Cegex.char 'a'))
85+
(Cegex.group 1 [] (Cegex.char 'b')))
86+
(Cegex.star (Cegex.char 'a'))
87+
)
88+
"aaabaaa" =
89+
Option.some "b"
90+
91+
#guard capture 1 (Cegex.concat (Cegex.concat
92+
(Cegex.star (Cegex.char 'a'))
93+
(Cegex.group 1 [] (Cegex.star (Cegex.char 'b'))))
94+
(Cegex.star (Cegex.char 'a'))
95+
)
96+
"aaabbbaaa" =
97+
Option.some "bbb"
98+
99+
#guard capture 1 (Cegex.concat (Cegex.concat
100+
(Cegex.star (Cegex.char 'a'))
101+
(Cegex.group 1 []
102+
(Cegex.or
103+
(Cegex.star (Cegex.char 'b'))
104+
(Cegex.star (Cegex.char 'c'))
105+
)
106+
))
107+
(Cegex.star (Cegex.char 'a'))
108+
)
109+
"aaacccaaa" =
110+
Option.some "ccc"

Validator/Learning/Regex/RegexCapture.lean

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,22 @@
11
-- This algorithm is from Regular Expression Sub-Matching using Partial Derivatives - Martin Sulzmann and Kenny Zhuo Ming Lu
22

3+
-- Thank you Keegan Perry for simplifying and explaining this to me.
34
import Validator.Learning.Regex.Regex
45

56
namespace RegexCapture
67

7-
def charToEmptySet (x: Regex): Regex :=
8+
-- neutralize replaces all chars with emptyset.
9+
-- This way the expression will stay nullable and not change based on derivative input.
10+
-- This makes it possible to keep all the capture groups inside for later extraction.
11+
def neutralize (x: Regex): Regex :=
812
match x with
913
| Regex.emptyset => Regex.emptyset
1014
| Regex.epsilon => Regex.epsilon
1115
| Regex.char _ => Regex.emptyset
12-
| Regex.or y z => Regex.smartOr (charToEmptySet y) (charToEmptySet z)
13-
| Regex.concat y z => Regex.concat (charToEmptySet y) (charToEmptySet z)
14-
| Regex.star y => Regex.star (charToEmptySet y)
15-
| Regex.group id c y => Regex.group id c (charToEmptySet y)
16+
| Regex.or y z => Regex.smartOr (neutralize y) (neutralize z)
17+
| Regex.concat y z => Regex.concat (neutralize y) (neutralize z)
18+
| Regex.star y => Regex.star (neutralize y)
19+
| Regex.group id c y => Regex.group id c (neutralize y)
1620

1721
partial def derive (x: Regex) (char: Char): Regex :=
1822
match x with
@@ -29,7 +33,7 @@ partial def derive (x: Regex) (char: Char): Regex :=
2933
(Regex.smartConcat (derive y char) z)
3034
-- A difference from the usual derive algorithm:
3135
-- Instead of (derive z tree), we write:
32-
(Regex.smartConcat (charToEmptySet y) (derive z char))
36+
(Regex.smartConcat (neutralize y) (derive z char))
3337
else Regex.concat (derive y char) z
3438
| Regex.star y => Regex.smartConcat (derive y char) x
3539
-- group is the new operator compared to Expr.

0 commit comments

Comments
 (0)