1
+ #lang racket/base
2
+
3
+ (require racket/match
4
+ racket/contract
5
+ (only-in racket/list add-between))
6
+
7
+ (provide
8
+ (contract-out [parse-content-disposition-header
9
+ (-> bytes?
10
+ (or/c
11
+ (list/c 'parsefail string?)
12
+ (list/c bytes?
13
+ (listof (list/c bytes? bytes?)))))]))
14
+
15
+ (struct parsefail exn ())
16
+
17
+ ;; this file parses the Content-Disposition line of HTTP headers
18
+
19
+ ;; from RFC6266:
20
+ #|content-disposition = "Content-Disposition" ":"
21
+ disposition-type *( ";" disposition-parm )
22
+
23
+ disposition-type = "inline" | "attachment" | disp-ext-type
24
+ ; case-insensitive
25
+ disp-ext-type = token
26
+
27
+ disposition-parm = filename-parm | disp-ext-parm
28
+
29
+ filename-parm = "filename" "=" value
30
+ | "filename*" "=" ext-value
31
+
32
+ disp-ext-parm = token "=" value
33
+ | ext-token "=" ext-value
34
+ ext-token = <the characters in token, followed by "*">
35
+ |#
36
+
37
+ ;; HOWEVER: we're explicitly giving up on RFC5987-style ext-values. If
38
+ ;; we see a parm or filename-parm whose token ends with a star, we
39
+ ;; just give up.
40
+
41
+ ;; SUPER-LIGHTWEIGHT PARSER FRAMEWORK:
42
+
43
+ ;; this is about the most lightweight parser framework that I could
44
+ ;; come up with:
45
+
46
+ ;; a parser returns a list containing a parsed value and a byte string
47
+ ;; containing the remainder, or it returns false. Only one parsing is
48
+ ;; possible with this scheme. Also, this parser does not support
49
+ ;; backtracking; the kleene star insists on eating the rest of the
50
+ ;; input. This should ensure that every parser created with this framework
51
+ ;; is nice and fast.
52
+
53
+ ;; it would be fun to convert this to TR, and I think it would work fine.
54
+ ;; I don't have time to do it right now... :(
55
+
56
+ ;; given a bunch of parsers, use each of them and combine
57
+ ;; their parsed values in a list. This is basically just the I/O
58
+ ;; monad. Or, to be more specific, just the "I" monad. Er, with
59
+ ;; the exception monad mixed in. Kinda.
60
+ (define (seq . parsers)
61
+ (cond [(null? parsers) (λ (bstr) (list '() bstr))]
62
+ [else (λ (bstr)
63
+ (match ((car parsers) bstr)
64
+ [(list yay leftover)
65
+ ((postproc (apply seq (cdr parsers))
66
+ (λ (restyay) (cons yay restyay)))
67
+ leftover)]
68
+ [#f #f ]))]))
69
+
70
+ ;; given two parsers, use the first one that succeeds. No backtracking.
71
+ (define (orparse p1 p2)
72
+ (λ (bstr)
73
+ (match (p1 bstr)
74
+ [(list yay leftover) (list yay leftover)]
75
+ [#f (p2 bstr)])))
76
+
77
+
78
+ ;; Kleene star: given a parser, parse until you can't parse any more
79
+ (define (kstar parser)
80
+ (λ (bstr)
81
+ (match (parser bstr)
82
+ [(list yay leftover)
83
+ ((postproc (kstar parser)
84
+ (λ (v) (cons yay v)))
85
+ leftover)]
86
+ [#f (list '() bstr)])))
87
+
88
+
89
+ ;; given a regexp (good idea for it to start with ^),
90
+ ;; and a function to apply to the matched bytes before
91
+ ;; returning, return a parser for that regexp
92
+ (define (rx-matcher regexp postproc)
93
+ (λ (bstr)
94
+ (define maybe-matches (regexp-match-positions regexp bstr))
95
+ (match maybe-matches
96
+ [(list (cons 0 end) other ... )
97
+ (list (postproc (subbytes bstr 0 end))
98
+ (subbytes bstr end))]
99
+ [other #f ])))
100
+
101
+
102
+ ;; given a regexp (good idea for it to start with ^),
103
+ ;; return a parser for that regexp
104
+ (define (rx-matcher/raw regexp)
105
+ (rx-matcher regexp (λ (x) x)))
106
+
107
+ ;; given a regexp (good idea for it to start with ^),
108
+ ;; and a constant, return an rx parser that just returns
109
+ ;; the contstant (if it matches)
110
+ (define (rx-matcher/const regexp const)
111
+ (rx-matcher regexp (λ (_ ) const)))
112
+
113
+ ;; given a parser, return a new parser that strips 'v' from the list
114
+ ;; in the result position (if the whole parse result is #f, just
115
+ ;; return it). Doesn't recur into sublists.
116
+ (define (strip v p)
117
+ (postproc p (λ (l) (filter (λ (elt) (not (equal? elt v))) l))))
118
+
119
+ ;; apply the given 'pp-fun' to the value in the result position
120
+ ;; of the parser. If the parser fails, just return the fail
121
+ (define (postproc parser pp-fun)
122
+ (λ (bstr)
123
+ (match (parser bstr)
124
+ [(list result leftover)
125
+ (list (pp-fun result) leftover)]
126
+ [#f #f ])))
127
+
128
+ ;; try to use parser p. if it fails, pretend it succeeded, and use
129
+ ;; the given value as the result
130
+ (define (opt p val)
131
+ (orparse p (λ (bstr) (list val bstr))))
132
+
133
+ ;; linear white space
134
+ ;; NB: it looks like the request parser actually cleans up line breaks
135
+ ;; for us... no problem.
136
+ (define LWS (rx-matcher/const #px#"^(\r\n)?[ \t]+ " 'LWS ))
137
+ ;; optional linear white space
138
+ (define OPTLWS (opt LWS 'LWS ))
139
+ ;; optional leading whitespace
140
+ ;; IMPL: can't just staple OPTLWS on the front of a seq, because
141
+ ;; then on #""i n a kstar it gets partway through (viz, the optlws) and
142
+ ;; then thinks it's failed partway through a seq. Grr.
143
+ (define (leadingLWS parser)
144
+ (orparse (postproc (seq LWS parser) cadr) parser))
145
+
146
+ ;; a sequence where linear whitespace is allowed (and discarded)
147
+ ;; before and between every pair of elements
148
+ (define (seq/ws . parsers)
149
+ (strip 'LWS (leadingLWS (apply seq (add-between parsers OPTLWS)))))
150
+
151
+
152
+ (define SEMI (rx-matcher/const #px#"^; " 'SEMI ))
153
+ (define EQ (rx-matcher/const #px#"^= " 'EQ ))
154
+ (define DQ (rx-matcher/const #px#"^\" " 'DQ ))
155
+
156
+ ;; a quoted string. a quote followed by any character from 32-255 not
157
+ ;; including backslash or quote, but optionally a backslash followed
158
+ ;; by any char (can only be 0-127), and finally a close quote.
159
+ ;; IMPL NOTE: you can do all of this with a single regexp, but you
160
+ ;; wind up doing all the same work over again in cleaning up the
161
+ ;; string.
162
+ (define CLEANCHARSEQ (rx-matcher/raw #px#"^([ -!#-[]|[\\]-\377])+ " ))
163
+ (define QDESCAPED (rx-matcher #px#"^\\\\[\0-\177] "
164
+ (λ (v) (list 'escaped v))))
165
+ (define QTDSTR
166
+ (postproc (seq DQ (kstar (orparse CLEANCHARSEQ QDESCAPED)) DQ)
167
+ (λ (v) (list 'quoted (cadr v)))))
168
+ (define TOKEN (rx-matcher/raw #px#"^([-!#-'*-+.0-9A-Z^-z|~]+) " ))
169
+ (define VALUE (orparse TOKEN QTDSTR))
170
+
171
+ ;; give up if we see a token ending with a star; these signal
172
+ ;; RFC5987 ext-values, and we don't handle them correctly.
173
+ (define CLAUSE
174
+ (postproc
175
+ (seq/ws TOKEN EQ VALUE)
176
+ (λ (v)
177
+ (when (regexp-match #px#"\\*$ " (car v))
178
+ (raise
179
+ (parsefail
180
+ (format "token ending with * indicates unsupported ext-value: ~e "
181
+ (car v))
182
+ (current-continuation-marks))))
183
+ v)))
184
+
185
+ (define content-disposition-parser
186
+ (seq/ws TOKEN (kstar (seq/ws SEMI CLAUSE))))
187
+
188
+
189
+ ;; given the right-hand-side of a content-disposition header
190
+ ;; line, return a list containing the content-disposition-type
191
+ ;; and a list of token/value lists
192
+ (define (parse-content-disposition-header rhs)
193
+ (with-handlers ([parsefail?
194
+ (lambda (pf)
195
+ (list 'parsefail (exn-message pf)))])
196
+ (match (content-disposition-parser rhs)
197
+ [(list matched #"" )
198
+ (match matched
199
+ [(list ty clauses)
200
+ (list ty (for/list ([c (in-list clauses)])
201
+ (match c
202
+ [(list 'SEMI (list tok 'EQ val))
203
+ (list tok (val-cleanup val))]
204
+ [other (error
205
+ 'parse-content-disposition-header
206
+ "internal error, unexpected parse shape: ~e "
207
+ c)])))]
208
+ [other
209
+ (error 'parse-content-disposition-header
210
+ "internal error, unexpected parse shape 2: ~e "
211
+ other)])]
212
+ [other
213
+ (list 'parsefail
214
+ (format
215
+ (string-append
216
+ "expected: byte string matching RFC6266 spec with "
217
+ "no RFC5987 ext-values, got: ~e " )
218
+ rhs))])))
219
+
220
+ ;; clean up a quoted string by removing the quotes and undoing escaping
221
+ (define (val-cleanup val)
222
+ (match val
223
+ [(? bytes? b) b]
224
+ [(list 'quoted l)
225
+ (apply bytes-append (for/list ([chunk (in-list l)])
226
+ (match chunk
227
+ [(? bytes? b) b]
228
+ [(list 'escaped eseq)
229
+ (subbytes eseq 1 2 )])))]))
230
+
231
+ (module+ test
232
+ (require rackunit)
233
+
234
+ (check-equal? (QDESCAPED #"\\\003 3 " )
235
+ (list '(escaped #"\\\003 " ) #" 3 " ))
236
+ (check-equal? ((orparse CLEANCHARSEQ QDESCAPED) #"\\\003 3 " )
237
+ (list '(escaped #"\\\003 " ) #" 3 " ))
238
+ (check-equal? (QTDSTR #"\"abc\\\003\\\"def\" " )
239
+ (list '(quoted (#"abc "
240
+ (escaped #"\\\003 " )
241
+ (escaped #"\\\" " )
242
+ #"def " ))
243
+ #"" ))
244
+
245
+ (check-equal?
246
+ (parse-content-disposition-header
247
+ #" form-data ;name=\"abcz\"; filename=\"abc\\\"d\" " )
248
+ '(#"form-data "
249
+ ((#"name " #"abcz " )
250
+ (#"filename " #"abc\"d " ))))
251
+
252
+ (check-equal? (TOKEN #"form-data ; " )
253
+ (list #"form-data " #" ; " ))
254
+
255
+ (check-equal? ((seq LWS TOKEN) #" form-data ; " )
256
+ (list (list 'LWS #"form-data " ) #" ; " ))
257
+
258
+ (check-equal? ((seq/ws TOKEN) #" form-data ; " )
259
+ (list (list #"form-data " ) #" ; " ))
260
+
261
+ (check-equal? (QTDSTR #"\"abcz\"; filename=\"abc\\\"d\"
262
+ zokbar=abc24 " )
263
+ (list '(quoted (#"abcz " ))
264
+ #"; filename=\"abc\\\"d\"
265
+ zokbar=abc24 " ))
266
+
267
+ (check-equal? (QTDSTR #"\"abc\\\"d\"
268
+ ; zokbar=abc24 " )
269
+ (list '(quoted (#"abc " (escaped #"\\\" " ) #"d " ))
270
+ #"
271
+ ; zokbar=abc24 " ))
272
+
273
+ (check-equal? (content-disposition-parser
274
+ #" form-data ;name=\"abcz\"; filename=\"abc\\\"d\"\r
275
+ ; zokbar=abc24 " )
276
+ (list `(#"form-data "
277
+ ((SEMI (#"name " EQ (quoted (#"abcz " ))))
278
+ (SEMI (#"filename " EQ (quoted (#"abc "
279
+ (escaped #"\\\" " )
280
+ #"d " ))))
281
+ (SEMI (#"zokbar " EQ #"abc24 " ))))
282
+ #"" ))
283
+
284
+ (check-equal? (QTDSTR #"\"filename=\" " )
285
+ (list '(quoted (#"filename= " )) #"" ))
286
+
287
+
288
+ (check-equal?
289
+ (content-disposition-parser
290
+ #"form-data; name=\"filename=\"; zokbar=\"dingo\"; filename=\"wallaby\" " )
291
+ (list `(#"form-data "
292
+ ((SEMI (#"name " EQ (quoted (#"filename= " ))))
293
+ (SEMI (#"zokbar " EQ (quoted (#"dingo " ))))
294
+ (SEMI (#"filename " EQ (quoted (#"wallaby " ))))))
295
+ #"" ))
296
+
297
+ (check-match
298
+ (parse-content-disposition-header
299
+ #"form-data; name=\"filen\"ame=\"; zokbar=\"dingo\"; filename=\"wallaby\" " )
300
+ (list 'parsefail (regexp #px"expected: byte string matching RFC6266 " )))
301
+
302
+ (check-match
303
+ (parse-content-disposition-header
304
+ #"form-data; name=\"filename=\"; zokbar*=\"dingo\"; filename=\"wallaby\" " )
305
+ (list 'parsefail (regexp #px"token ending with * " )))
306
+
307
+ )
308
+
309
+ ;; this code was used to generate the regexp for tokens. In principle,
310
+ ;; you shouldn't need this code unless you need to re-generate this
311
+ ;; regexp
312
+ (module background racket
313
+
314
+ (require rackunit)
315
+
316
+ ;; from RFC 2616:
317
+ #|token = 1*<any CHAR except CTLs or separators>
318
+ separators = "(" | ")" | "<" | ">" | "@"
319
+ | "," | ";" | ":" | "\" | <">
320
+ | "/" | "[" | "]" | "?" | "="
321
+ | "{" | "}" | SP | HT
322
+ |#
323
+
324
+ (define separators
325
+ (map (λ (s) (first (string->list s)))
326
+ '("( " ") " "< " "> " "@ "
327
+ ", " "; " ": " "\\ " "\" "
328
+ "/ " "[ " "] " "? " "= "
329
+ "{ " "} " " " "\t " )))
330
+
331
+ (define CTLs
332
+ (cons #\u007f
333
+ (for/list ([i (in-range 0 32 )])
334
+ (integer->char i))))
335
+
336
+ ;; add hyphen because it has to be treated
337
+ ;; specially in regexps:
338
+ (define separators-plus-ctls-plus-hyphen
339
+ (cons #\- (append separators CTLs)))
340
+
341
+ (define omitted-integers
342
+ (remove-duplicates
343
+ (sort (map char->integer separators-plus-ctls-plus-hyphen) <)))
344
+
345
+ (define ranges
346
+ (let loop ([range-begin 0 ]
347
+ [badchars omitted-integers])
348
+ (cond [(null? badchars)
349
+ (cond [(< range-begin 127 )
350
+ (list (list range-begin 126 ))]
351
+ [else (list)])]
352
+ [else
353
+ (define nextbad (first badchars))
354
+ (cond [(< range-begin nextbad)
355
+ (cons (list range-begin (sub1 nextbad))
356
+ (loop (add1 nextbad) (rest badchars)))]
357
+ [(= range-begin nextbad)
358
+ (loop (add1 range-begin)
359
+ (rest badchars))]
360
+ [else
361
+ (error 'impossible-i-thought
362
+ "~a ~a " range-begin nextbad)])])))
363
+
364
+ (define token-regexp-bstr
365
+ (string->bytes/utf-8
366
+ (call-with-output-string
367
+ (λ (port)
368
+ ;; adding the hyphen back in here:
369
+ (fprintf port "[- " )
370
+ (for/list ([r (in-list ranges)])
371
+ (cond [(equal? (first r) (second r))
372
+ (fprintf port "~a " (string (integer->char (first r))))]
373
+ [else
374
+ (fprintf port "~a~a~a " (string (integer->char (first r))) "- "
375
+ (string (integer->char (second r))))]))
376
+ (fprintf port "] " )))))
377
+
378
+ ;; check that it works:
379
+
380
+ (for ([i (in-range 0 128 )])
381
+ (define ch (integer->char i))
382
+ (cond [(member ch (append separators CTLs))
383
+ (check-pred (λ (ch) (not (regexp-match? token-regexp-bstr
384
+ (string ch))))
385
+ ch)]
386
+ [else
387
+ (check-pred (λ (ch) (regexp-match? token-regexp-bstr (string ch)))
388
+ ch)])))
0 commit comments