Skip to content

Commit d2c0f26

Browse files
committed
add code to parse content-disposition headers per RFC6266
1 parent 151538d commit d2c0f26

File tree

3 files changed

+480
-9
lines changed

3 files changed

+480
-9
lines changed
Lines changed: 388 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,388 @@
1+
#lang racket/base
2+
3+
(require racket/match
4+
racket/contract
5+
(only-in racket/list add-between))
6+
7+
(provide
8+
(contract-out [parse-content-disposition-header
9+
(-> bytes?
10+
(or/c
11+
(list/c 'parsefail string?)
12+
(list/c bytes?
13+
(listof (list/c bytes? bytes?)))))]))
14+
15+
(struct parsefail exn ())
16+
17+
;; this file parses the Content-Disposition line of HTTP headers
18+
19+
;; from RFC6266:
20+
#|content-disposition = "Content-Disposition" ":"
21+
disposition-type *( ";" disposition-parm )
22+
23+
disposition-type = "inline" | "attachment" | disp-ext-type
24+
; case-insensitive
25+
disp-ext-type = token
26+
27+
disposition-parm = filename-parm | disp-ext-parm
28+
29+
filename-parm = "filename" "=" value
30+
| "filename*" "=" ext-value
31+
32+
disp-ext-parm = token "=" value
33+
| ext-token "=" ext-value
34+
ext-token = <the characters in token, followed by "*">
35+
|#
36+
37+
;; HOWEVER: we're explicitly giving up on RFC5987-style ext-values. If
38+
;; we see a parm or filename-parm whose token ends with a star, we
39+
;; just give up.
40+
41+
;; SUPER-LIGHTWEIGHT PARSER FRAMEWORK:
42+
43+
;; this is about the most lightweight parser framework that I could
44+
;; come up with:
45+
46+
;; a parser returns a list containing a parsed value and a byte string
47+
;; containing the remainder, or it returns false. Only one parsing is
48+
;; possible with this scheme. Also, this parser does not support
49+
;; backtracking; the kleene star insists on eating the rest of the
50+
;; input. This should ensure that every parser created with this framework
51+
;; is nice and fast.
52+
53+
;; it would be fun to convert this to TR, and I think it would work fine.
54+
;; I don't have time to do it right now... :(
55+
56+
;; given a bunch of parsers, use each of them and combine
57+
;; their parsed values in a list. This is basically just the I/O
58+
;; monad. Or, to be more specific, just the "I" monad. Er, with
59+
;; the exception monad mixed in. Kinda.
60+
(define (seq . parsers)
61+
(cond [(null? parsers) (λ (bstr) (list '() bstr))]
62+
[else (λ (bstr)
63+
(match ((car parsers) bstr)
64+
[(list yay leftover)
65+
((postproc (apply seq (cdr parsers))
66+
(λ (restyay) (cons yay restyay)))
67+
leftover)]
68+
[#f #f]))]))
69+
70+
;; given two parsers, use the first one that succeeds. No backtracking.
71+
(define (orparse p1 p2)
72+
(λ (bstr)
73+
(match (p1 bstr)
74+
[(list yay leftover) (list yay leftover)]
75+
[#f (p2 bstr)])))
76+
77+
78+
;; Kleene star: given a parser, parse until you can't parse any more
79+
(define (kstar parser)
80+
(λ (bstr)
81+
(match (parser bstr)
82+
[(list yay leftover)
83+
((postproc (kstar parser)
84+
(λ (v) (cons yay v)))
85+
leftover)]
86+
[#f (list '() bstr)])))
87+
88+
89+
;; given a regexp (good idea for it to start with ^),
90+
;; and a function to apply to the matched bytes before
91+
;; returning, return a parser for that regexp
92+
(define (rx-matcher regexp postproc)
93+
(λ (bstr)
94+
(define maybe-matches (regexp-match-positions regexp bstr))
95+
(match maybe-matches
96+
[(list (cons 0 end) other ...)
97+
(list (postproc (subbytes bstr 0 end))
98+
(subbytes bstr end))]
99+
[other #f])))
100+
101+
102+
;; given a regexp (good idea for it to start with ^),
103+
;; return a parser for that regexp
104+
(define (rx-matcher/raw regexp)
105+
(rx-matcher regexp (λ (x) x)))
106+
107+
;; given a regexp (good idea for it to start with ^),
108+
;; and a constant, return an rx parser that just returns
109+
;; the contstant (if it matches)
110+
(define (rx-matcher/const regexp const)
111+
(rx-matcher regexp (λ (_) const)))
112+
113+
;; given a parser, return a new parser that strips 'v' from the list
114+
;; in the result position (if the whole parse result is #f, just
115+
;; return it). Doesn't recur into sublists.
116+
(define (strip v p)
117+
(postproc p (λ (l) (filter (λ (elt) (not (equal? elt v))) l))))
118+
119+
;; apply the given 'pp-fun' to the value in the result position
120+
;; of the parser. If the parser fails, just return the fail
121+
(define (postproc parser pp-fun)
122+
(λ (bstr)
123+
(match (parser bstr)
124+
[(list result leftover)
125+
(list (pp-fun result) leftover)]
126+
[#f #f])))
127+
128+
;; try to use parser p. if it fails, pretend it succeeded, and use
129+
;; the given value as the result
130+
(define (opt p val)
131+
(orparse p (λ (bstr) (list val bstr))))
132+
133+
;; linear white space
134+
;; NB: it looks like the request parser actually cleans up line breaks
135+
;; for us... no problem.
136+
(define LWS (rx-matcher/const #px#"^(\r\n)?[ \t]+" 'LWS))
137+
;; optional linear white space
138+
(define OPTLWS (opt LWS 'LWS))
139+
;; optional leading whitespace
140+
;; IMPL: can't just staple OPTLWS on the front of a seq, because
141+
;; then on #""i n a kstar it gets partway through (viz, the optlws) and
142+
;; then thinks it's failed partway through a seq. Grr.
143+
(define (leadingLWS parser)
144+
(orparse (postproc (seq LWS parser) cadr) parser))
145+
146+
;; a sequence where linear whitespace is allowed (and discarded)
147+
;; before and between every pair of elements
148+
(define (seq/ws . parsers)
149+
(strip 'LWS (leadingLWS (apply seq (add-between parsers OPTLWS)))))
150+
151+
152+
(define SEMI (rx-matcher/const #px#"^;" 'SEMI))
153+
(define EQ (rx-matcher/const #px#"^=" 'EQ))
154+
(define DQ (rx-matcher/const #px#"^\"" 'DQ))
155+
156+
;; a quoted string. a quote followed by any character from 32-255 not
157+
;; including backslash or quote, but optionally a backslash followed
158+
;; by any char (can only be 0-127), and finally a close quote.
159+
;; IMPL NOTE: you can do all of this with a single regexp, but you
160+
;; wind up doing all the same work over again in cleaning up the
161+
;; string.
162+
(define CLEANCHARSEQ (rx-matcher/raw #px#"^([ -!#-[]|[\\]-\377])+"))
163+
(define QDESCAPED (rx-matcher #px#"^\\\\[\0-\177]"
164+
(λ (v) (list 'escaped v))))
165+
(define QTDSTR
166+
(postproc (seq DQ (kstar (orparse CLEANCHARSEQ QDESCAPED)) DQ)
167+
(λ (v) (list 'quoted (cadr v)))))
168+
(define TOKEN (rx-matcher/raw #px#"^([-!#-'*-+.0-9A-Z^-z|~]+)"))
169+
(define VALUE (orparse TOKEN QTDSTR))
170+
171+
;; give up if we see a token ending with a star; these signal
172+
;; RFC5987 ext-values, and we don't handle them correctly.
173+
(define CLAUSE
174+
(postproc
175+
(seq/ws TOKEN EQ VALUE)
176+
(λ (v)
177+
(when (regexp-match #px#"\\*$" (car v))
178+
(raise
179+
(parsefail
180+
(format "token ending with * indicates unsupported ext-value: ~e"
181+
(car v))
182+
(current-continuation-marks))))
183+
v)))
184+
185+
(define content-disposition-parser
186+
(seq/ws TOKEN (kstar (seq/ws SEMI CLAUSE))))
187+
188+
189+
;; given the right-hand-side of a content-disposition header
190+
;; line, return a list containing the content-disposition-type
191+
;; and a list of token/value lists
192+
(define (parse-content-disposition-header rhs)
193+
(with-handlers ([parsefail?
194+
(lambda (pf)
195+
(list 'parsefail (exn-message pf)))])
196+
(match (content-disposition-parser rhs)
197+
[(list matched #"")
198+
(match matched
199+
[(list ty clauses)
200+
(list ty (for/list ([c (in-list clauses)])
201+
(match c
202+
[(list 'SEMI (list tok 'EQ val))
203+
(list tok (val-cleanup val))]
204+
[other (error
205+
'parse-content-disposition-header
206+
"internal error, unexpected parse shape: ~e"
207+
c)])))]
208+
[other
209+
(error 'parse-content-disposition-header
210+
"internal error, unexpected parse shape 2: ~e"
211+
other)])]
212+
[other
213+
(list 'parsefail
214+
(format
215+
(string-append
216+
"expected: byte string matching RFC6266 spec with "
217+
"no RFC5987 ext-values, got: ~e")
218+
rhs))])))
219+
220+
;; clean up a quoted string by removing the quotes and undoing escaping
221+
(define (val-cleanup val)
222+
(match val
223+
[(? bytes? b) b]
224+
[(list 'quoted l)
225+
(apply bytes-append (for/list ([chunk (in-list l)])
226+
(match chunk
227+
[(? bytes? b) b]
228+
[(list 'escaped eseq)
229+
(subbytes eseq 1 2)])))]))
230+
231+
(module+ test
232+
(require rackunit)
233+
234+
(check-equal? (QDESCAPED #"\\\003 3")
235+
(list '(escaped #"\\\003") #" 3"))
236+
(check-equal? ((orparse CLEANCHARSEQ QDESCAPED) #"\\\003 3")
237+
(list '(escaped #"\\\003") #" 3"))
238+
(check-equal? (QTDSTR #"\"abc\\\003\\\"def\"")
239+
(list '(quoted (#"abc"
240+
(escaped #"\\\003")
241+
(escaped #"\\\"")
242+
#"def"))
243+
#""))
244+
245+
(check-equal?
246+
(parse-content-disposition-header
247+
#" form-data ;name=\"abcz\"; filename=\"abc\\\"d\"")
248+
'(#"form-data"
249+
((#"name" #"abcz")
250+
(#"filename" #"abc\"d"))))
251+
252+
(check-equal? (TOKEN #"form-data ;")
253+
(list #"form-data" #" ;"))
254+
255+
(check-equal? ((seq LWS TOKEN) #" form-data ;")
256+
(list (list 'LWS #"form-data") #" ;"))
257+
258+
(check-equal? ((seq/ws TOKEN) #" form-data ;")
259+
(list (list #"form-data") #" ;"))
260+
261+
(check-equal? (QTDSTR #"\"abcz\"; filename=\"abc\\\"d\"
262+
zokbar=abc24")
263+
(list '(quoted (#"abcz"))
264+
#"; filename=\"abc\\\"d\"
265+
zokbar=abc24"))
266+
267+
(check-equal? (QTDSTR #"\"abc\\\"d\"
268+
; zokbar=abc24")
269+
(list '(quoted (#"abc" (escaped #"\\\"") #"d"))
270+
#"
271+
; zokbar=abc24"))
272+
273+
(check-equal? (content-disposition-parser
274+
#" form-data ;name=\"abcz\"; filename=\"abc\\\"d\"\r
275+
; zokbar=abc24")
276+
(list `(#"form-data"
277+
((SEMI (#"name" EQ (quoted (#"abcz"))))
278+
(SEMI (#"filename" EQ (quoted (#"abc"
279+
(escaped #"\\\"")
280+
#"d"))))
281+
(SEMI (#"zokbar" EQ #"abc24"))))
282+
#""))
283+
284+
(check-equal? (QTDSTR #"\"filename=\"")
285+
(list '(quoted (#"filename=")) #""))
286+
287+
288+
(check-equal?
289+
(content-disposition-parser
290+
#"form-data; name=\"filename=\"; zokbar=\"dingo\"; filename=\"wallaby\"")
291+
(list `(#"form-data"
292+
((SEMI (#"name" EQ (quoted (#"filename="))))
293+
(SEMI (#"zokbar" EQ (quoted (#"dingo"))))
294+
(SEMI (#"filename" EQ (quoted (#"wallaby"))))))
295+
#""))
296+
297+
(check-match
298+
(parse-content-disposition-header
299+
#"form-data; name=\"filen\"ame=\"; zokbar=\"dingo\"; filename=\"wallaby\"")
300+
(list 'parsefail (regexp #px"expected: byte string matching RFC6266")))
301+
302+
(check-match
303+
(parse-content-disposition-header
304+
#"form-data; name=\"filename=\"; zokbar*=\"dingo\"; filename=\"wallaby\"")
305+
(list 'parsefail (regexp #px"token ending with *")))
306+
307+
)
308+
309+
;; this code was used to generate the regexp for tokens. In principle,
310+
;; you shouldn't need this code unless you need to re-generate this
311+
;; regexp
312+
(module background racket
313+
314+
(require rackunit)
315+
316+
;; from RFC 2616:
317+
#|token = 1*<any CHAR except CTLs or separators>
318+
separators = "(" | ")" | "<" | ">" | "@"
319+
| "," | ";" | ":" | "\" | <">
320+
| "/" | "[" | "]" | "?" | "="
321+
| "{" | "}" | SP | HT
322+
|#
323+
324+
(define separators
325+
(map (λ (s) (first (string->list s)))
326+
'("(" ")" "<" ">" "@"
327+
"," ";" ":" "\\" "\""
328+
"/" "[" "]" "?" "="
329+
"{" "}" " " "\t")))
330+
331+
(define CTLs
332+
(cons #\u007f
333+
(for/list ([i (in-range 0 32)])
334+
(integer->char i))))
335+
336+
;; add hyphen because it has to be treated
337+
;; specially in regexps:
338+
(define separators-plus-ctls-plus-hyphen
339+
(cons #\- (append separators CTLs)))
340+
341+
(define omitted-integers
342+
(remove-duplicates
343+
(sort (map char->integer separators-plus-ctls-plus-hyphen) <)))
344+
345+
(define ranges
346+
(let loop ([range-begin 0]
347+
[badchars omitted-integers])
348+
(cond [(null? badchars)
349+
(cond [(< range-begin 127)
350+
(list (list range-begin 126))]
351+
[else (list)])]
352+
[else
353+
(define nextbad (first badchars))
354+
(cond [(< range-begin nextbad)
355+
(cons (list range-begin (sub1 nextbad))
356+
(loop (add1 nextbad) (rest badchars)))]
357+
[(= range-begin nextbad)
358+
(loop (add1 range-begin)
359+
(rest badchars))]
360+
[else
361+
(error 'impossible-i-thought
362+
"~a ~a" range-begin nextbad)])])))
363+
364+
(define token-regexp-bstr
365+
(string->bytes/utf-8
366+
(call-with-output-string
367+
(λ (port)
368+
;; adding the hyphen back in here:
369+
(fprintf port "[-")
370+
(for/list ([r (in-list ranges)])
371+
(cond [(equal? (first r) (second r))
372+
(fprintf port "~a" (string (integer->char (first r))))]
373+
[else
374+
(fprintf port "~a~a~a" (string (integer->char (first r))) "-"
375+
(string (integer->char (second r))))]))
376+
(fprintf port "]")))))
377+
378+
;; check that it works:
379+
380+
(for ([i (in-range 0 128)])
381+
(define ch (integer->char i))
382+
(cond [(member ch (append separators CTLs))
383+
(check-pred (λ (ch) (not (regexp-match? token-regexp-bstr
384+
(string ch))))
385+
ch)]
386+
[else
387+
(check-pred (λ (ch) (regexp-match? token-regexp-bstr (string ch)))
388+
ch)])))

0 commit comments

Comments
 (0)