@@ -16,6 +16,14 @@ class BaseRobotTest:
     bad = []
     site_maps = None
 
+    def __init_subclass__(cls):
+        super().__init_subclass__()
+        # Remove tests that do nothing.
+        if not cls.good:
+            cls.test_good_urls = None
+        if not cls.bad:
+            cls.test_bad_urls = None
+
     def setUp(self):
         lines = io.StringIO(self.robots_txt).readlines()
         self.parser = urllib.robotparser.RobotFileParser()
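(Not part of the patch.) A minimal sketch of why the __init_subclass__ hook above works, assuming nothing beyond the stock unittest loader: TestLoader.getTestCaseNames() only collects test* attributes that are callable, so shadowing an inherited test method with None makes a subclass with an empty good or bad list drop that test. The Base/WithData/WithoutData names below are made up for illustration.

import unittest

class Base:
    data = []

    def __init_subclass__(cls):
        super().__init_subclass__()
        # Shadow the inherited test with a non-callable so the loader skips it.
        if not cls.data:
            cls.test_data = None

    def test_data(self):
        self.assertTrue(self.data)

class WithData(Base, unittest.TestCase):
    data = [1]

class WithoutData(Base, unittest.TestCase):
    data = []

loader = unittest.TestLoader()
print(loader.getTestCaseNames(WithData))     # ['test_data']
print(loader.getTestCaseNames(WithoutData))  # [] -- the no-op test is gone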
@@ -249,15 +257,77 @@ class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
     bad = ['/some/path']
 
 
-class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
-    # normalize the URL first (#17403)
+class PercentEncodingTest(BaseRobotTest, unittest.TestCase):
     robots_txt = """\
 User-agent: *
-Allow: /some/path?
-Disallow: /another/path?
-"""
-    good = ['/some/path?']
-    bad = ['/another/path?']
+Disallow: /a1/Z-._~ # unreserved characters
+Disallow: /a2/%5A%2D%2E%5F%7E # percent-encoded unreserved characters
+Disallow: /u1/%F0%9F%90%8D # percent-encoded ASCII Unicode character
+Disallow: /u2/%f0%9f%90%8d
+Disallow: /u3/\U0001f40d # raw non-ASCII Unicode character
+Disallow: /v1/%F0 # percent-encoded non-ASCII octet
+Disallow: /v2/%f0
+Disallow: /v3/\udcf0 # raw non-ASCII octet
+Disallow: /p1%xy # raw percent
+Disallow: /p2%
+Disallow: /p3%25xy # percent-encoded percent
+Disallow: /p4%2525xy # double percent-encoded percent
+Disallow: /john%20smith # space
+Disallow: /john doe
+Disallow: /trailingspace%20
+Disallow: /query?q=v # query
+Disallow: /query2?q=%3F
+Disallow: /query3?q=?
+Disallow: /emptyquery?
+Disallow: /question%3Fq=v # not query
+Disallow: /hash%23f # not fragment
+Disallow: /dollar%24
+Disallow: /asterisk%2A
+Disallow: /sub/dir
+Disallow: /slash%2F
+"""
+    good = [
+        '/u1/%F0', '/u1/%f0',
+        '/u2/%F0', '/u2/%f0',
+        '/u3/%F0', '/u3/%f0',
+        '/p1%2525xy', '/p2%f0', '/p3%2525xy', '/p4%xy', '/p4%25xy',
+        '/query%3Fq=v', '/question?q=v',
+        '/emptyquery',
+        '/dollar', '/asterisk',
+    ]
+    bad = [
+        '/a1/Z-._~', '/a1/%5A%2D%2E%5F%7E',
+        '/a2/Z-._~', '/a2/%5A%2D%2E%5F%7E',
+        '/u1/%F0%9F%90%8D', '/u1/%f0%9f%90%8d', '/u1/\U0001f40d',
+        '/u2/%F0%9F%90%8D', '/u2/%f0%9f%90%8d', '/u2/\U0001f40d',
+        '/u3/%F0%9F%90%8D', '/u3/%f0%9f%90%8d', '/u3/\U0001f40d',
+        '/v1/%F0', '/v1/%f0', '/v1/\udcf0', '/v1/\U0001f40d',
+        '/v2/%F0', '/v2/%f0', '/v2/\udcf0', '/v2/\U0001f40d',
+        '/v3/%F0', '/v3/%f0', '/v3/\udcf0', '/v3/\U0001f40d',
+        '/p1%xy', '/p1%25xy',
+        '/p2%', '/p2%25', '/p2%2525', '/p2%xy',
+        '/p3%xy', '/p3%25xy',
+        '/p4%2525xy',
+        '/john%20smith', '/john smith',
+        '/john%20doe', '/john doe',
+        '/trailingspace%20', '/trailingspace ',
+        '/query?q=v', '/question%3Fq=v',
+        '/query2?q=?', '/query2?q=%3F',
+        '/query3?q=?', '/query3?q=%3F',
+        '/emptyquery?', '/emptyquery?q=v',
+        '/hash#f', '/hash%23f',
+        '/dollar$', '/dollar%24',
+        '/asterisk*', '/asterisk%2A',
+        '/sub/dir', '/sub%2Fdir',
+        '/slash%2F', '/slash/',
+    ]
+    # other reserved characters
+    for c in ":/#[]@!$&'()*+,;=":
+        robots_txt += f'Disallow: /raw{c}\nDisallow: /pc%{ord(c):02X}\n'
+        bad.append(f'/raw{c}')
+        bad.append(f'/raw%{ord(c):02X}')
+        bad.append(f'/pc{c}')
+        bad.append(f'/pc%{ord(c):02X}')
 
 
 class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
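(Not part of the patch.) A minimal usage sketch of what PercentEncodingTest exercises, assuming the patched urllib.robotparser from this change; with an unpatched parser the raw and percent-encoded spellings below may not be treated as equivalent. The rules and expected results are taken directly from the test's robots_txt and bad lists.

import urllib.robotparser

parser = urllib.robotparser.RobotFileParser()
parser.parse([
    'User-agent: *',
    'Disallow: /a1/Z-._~',      # unreserved characters, spelled raw
    'Disallow: /john%20smith',  # percent-encoded space
])

agent = 'test_robotparser'
# Both spellings of each path are listed as "bad" in the test above,
# so can_fetch() is expected to return False for all of them.
for path in ('/a1/Z-._~', '/a1/%5A%2D%2E%5F%7E',
             '/john%20smith', '/john smith'):
    print(path, parser.can_fetch(agent, path))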
@@ -299,26 +369,17 @@ def test_string_formatting(self):
         self.assertEqual(str(self.parser), self.expected_output)
 
 
-class RobotHandler(BaseHTTPRequestHandler):
-
-    def do_GET(self):
-        self.send_error(403, "Forbidden access")
-
-    def log_message(self, format, *args):
-        pass
-
-
 @unittest.skipUnless(
     support.has_socket_support,
     "Socket server requires working socket."
 )
-class PasswordProtectedSiteTestCase(unittest.TestCase):
+class BaseLocalNetworkTestCase:
 
     def setUp(self):
         # clear _opener global variable
         self.addCleanup(urllib.request.urlcleanup)
 
-        self.server = HTTPServer((socket_helper.HOST, 0), RobotHandler)
+        self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)
 
         self.t = threading.Thread(
             name='HTTPServer serving',
@@ -335,6 +396,57 @@ def tearDown(self):
         self.t.join()
         self.server.server_close()
 
+
+SAMPLE_ROBOTS_TXT = b'''\
+User-agent: test_robotparser
+Disallow: /utf8/\xf0\x9f\x90\x8d
+Disallow: /non-utf8/\xf0
+Disallow: //[spam]/path
+'''
+
+
+class LocalNetworkTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+    class RobotHandler(BaseHTTPRequestHandler):
+
+        def do_GET(self):
+            self.send_response(200)
+            self.end_headers()
+            self.wfile.write(SAMPLE_ROBOTS_TXT)
+
+        def log_message(self, format, *args):
+            pass
+
+    @threading_helper.reap_threads
+    def testRead(self):
+        # Test that reading a weird robots.txt doesn't fail.
+        addr = self.server.server_address
+        url = f'http://{socket_helper.HOST}:{addr[1]}'
+        robots_url = url + '/robots.txt'
+        parser = urllib.robotparser.RobotFileParser()
+        parser.set_url(robots_url)
+        parser.read()
+        # And it can even interpret the weird paths in some reasonable way.
+        agent = 'test_robotparser'
+        self.assertTrue(parser.can_fetch(agent, robots_url))
+        self.assertTrue(parser.can_fetch(agent, url + '/utf8/'))
+        self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
+        self.assertFalse(parser.can_fetch(agent, url + '/utf8/%F0%9F%90%8D'))
+        self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
+        self.assertTrue(parser.can_fetch(agent, url + '/non-utf8/'))
+        self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/%F0'))
+        self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/\U0001f40d'))
+        self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))
+
+
+class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
+    class RobotHandler(BaseHTTPRequestHandler):
+
+        def do_GET(self):
+            self.send_error(403, "Forbidden access")
+
+        def log_message(self, format, *args):
+            pass
+
     @threading_helper.reap_threads
     def testPasswordProtectedSite(self):
         addr = self.server.server_address
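(Not part of the patch.) PasswordProtectedSiteTestCase continues to check the long-standing behaviour that a 401/403 response for robots.txt makes the parser disallow everything. A minimal sketch, assuming a server at the hypothetical URL below answers 403:

import urllib.robotparser

parser = urllib.robotparser.RobotFileParser()
parser.set_url('http://127.0.0.1:8080/robots.txt')  # hypothetical; assume it returns 403
parser.read()  # on 401/403 the parser sets disallow_all = True
print(parser.can_fetch('test_robotparser', 'http://127.0.0.1:8080/secret'))  # False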