@@ -8,9 +8,9 @@ import Validator.Learning.Tegex.Tegex
88
99namespace TegexCapture
1010
11- -- neutralize replaces all chars with emptyset.
12- -- This way the expression will stay nullable and not change based on derivative input .
13- -- This makes it possible to keep all the capture groups inside for later extraction .
11+ -- neutralize replaces all tree operators with emptyset.
12+ -- It is used when deriving concat.
13+ -- This way we do not lose capture information on nullable expressions.
1414def neutralize (x: Tegex): Tegex :=
1515 match x with
1616 | Tegex.emptyset => Tegex.emptyset
@@ -26,6 +26,7 @@ partial def derive (x: Tegex) (tree: ParseTree): Tegex :=
2626 match x with
2727 | Tegex.emptyset => Tegex.emptyset
2828 | Tegex.epsilon => Tegex.emptyset
29+ -- Remember that matched is just epsilon, so it has the same derivative.
2930 | Tegex.matched _ _ => Tegex.emptyset
3031 | Tegex.tree tok' childExpr =>
3132 match tree with
@@ -43,12 +44,11 @@ partial def derive (x: Tegex) (tree: ParseTree): Tegex :=
4344 then Tegex.smartOr
4445 (Tegex.smartConcat (derive y tree) z)
4546 -- A difference from the usual derive algorithm:
46- -- Instead of (derive z tree), we write:
47+ -- To preserve the capture information in the nullable expression y,
48+ -- instead of (derive z tree), we write:
4749 (Tegex.smartConcat (neutralize y) (derive z tree))
4850 else Tegex.concat (derive y tree) z
4951 | Tegex.star y => Tegex.smartConcat (derive y tree) x
50- -- group is the new operator compared to Expr.
51- -- We store the input tree in the expression.
5252 | Tegex.group n y =>
5353 Tegex.group n (derive y tree)
5454
@@ -57,7 +57,7 @@ def extract (x: Tegex): List ParseTree :=
5757 -- should never be encountered, since emptyset is not nullable.
5858 | Tegex.emptyset => []
5959 | Tegex.epsilon => []
60- -- should never be encountered, since char is not nullable.
60+ -- should never be encountered, since tree is not nullable.
6161 | Tegex.tree _ _ => []
6262 | Tegex.matched tok childExpr => [ParseTree.mk tok (extract childExpr)]
6363 | Tegex.or y z =>
@@ -70,7 +70,7 @@ def extract (x: Tegex): List ParseTree :=
7070 -- Groups under a star are ignored.
7171 -- Recursively extracting under the star causes empty captures to be reported, which we do not want under POSIX semantics.
7272 | Tegex.star _ => []
73- -- ignore group, this group will be extracted later by extractGroups.
73+ -- Ignore group, this group will be extracted later by extractGroups.
7474 | Tegex.group _ y => extract y
7575
7676def extractGroups (x: Tegex): List (Nat × List ParseTree) :=
@@ -80,6 +80,7 @@ def extractGroups (x: Tegex): List (Nat × List ParseTree) :=
8080 | Tegex.epsilon => []
8181 -- should never be encountered, since tree is not nullable.
8282 | Tegex.tree _ _ => []
83+ -- There may be groups in the childExpr that need to be extracted.
8384 | Tegex.matched _ childExpr => extractGroups childExpr
8485 | Tegex.or y z =>
8586 -- Under POSIX semantics, we prefer matching the left alternative.
@@ -91,7 +92,7 @@ def extractGroups (x: Tegex): List (Nat × List ParseTree) :=
9192 -- Groups under a star are ignored.
9293 -- Recursively extracting under the star causes empty captures to be reported, which we do not want under POSIX semantics.
9394 | Tegex.star _ => []
94- -- extract the string
95+ -- extract the forest for the single group id
9596 | Tegex.group id y => (id, extract y) :: extractGroups y
9697
9798def captures (x: Tegex) (forest: List ParseTree): Option (List (Nat × List ParseTree)) :=
0 commit comments