Skip to content

Commit 9c026fa

Browse files
gh-111788: Fix parsing and normalization of rules and URLs in robotparser
* Distinguish the query separator from a percent-encoded `?`.
* Fix support of non-UTF-8 robots.txt files.
* Don't fail trying to parse weird paths.
1 parent 06f8d7a commit 9c026fa

File tree

3 files changed

+155
-28
lines changed

3 files changed

+155
-28
lines changed

Lib/test/test_robotparser.py

Lines changed: 130 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,14 @@ class BaseRobotTest:
1616
bad = []
1717
site_maps = None
1818

19+
def __init_subclass__(cls):
20+
super().__init_subclass__()
21+
# Remove tests that do nothing.
22+
if not cls.good:
23+
cls.test_good_urls = None
24+
if not cls.bad:
25+
cls.test_bad_urls = None
26+
1927
def setUp(self):
2028
lines = io.StringIO(self.robots_txt).readlines()
2129
self.parser = urllib.robotparser.RobotFileParser()
@@ -249,15 +257,77 @@ class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
249257
bad = ['/some/path']
250258

251259

252-
class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
253-
# normalize the URL first (#17403)
260+
class PercentEncodingTest(BaseRobotTest, unittest.TestCase):
254261
robots_txt = """\
255262
User-agent: *
256-
Allow: /some/path?
257-
Disallow: /another/path?
258-
"""
259-
good = ['/some/path?']
260-
bad = ['/another/path?']
263+
Disallow: /a1/Z-._~ # unreserved characters
264+
Disallow: /a2/%5A%2D%2E%5F%7E # percent-encoded unreserved characters
265+
Disallow: /u1/%F0%9F%90%8D # percent-encoded ASCII Unicode character
266+
Disallow: /u2/%f0%9f%90%8d
267+
Disallow: /u3/\U0001f40d # raw non-ASCII Unicode character
268+
Disallow: /v1/%F0 # percent-encoded non-ASCII octet
269+
Disallow: /v2/%f0
270+
Disallow: /v3/\udcf0 # raw non-ASCII octet
271+
Disallow: /p1%xy # raw percent
272+
Disallow: /p2%
273+
Disallow: /p3%25xy # percent-encoded percent
274+
Disallow: /p4%2525xy # double percent-encoded percent
275+
Disallow: /john%20smith # space
276+
Disallow: /john doe
277+
Disallow: /trailingspace%20
278+
Disallow: /query?q=v # query
279+
Disallow: /query2?q=%3F
280+
Disallow: /query3?q=?
281+
Disallow: /emptyquery?
282+
Disallow: /question%3Fq=v # not query
283+
Disallow: /hash%23f # not fragment
284+
Disallow: /dollar%24
285+
Disallow: /asterisk%2A
286+
Disallow: /sub/dir
287+
Disallow: /slash%2F
288+
"""
289+
good = [
290+
'/u1/%F0', '/u1/%f0',
291+
'/u2/%F0', '/u2/%f0',
292+
'/u3/%F0', '/u3/%f0',
293+
'/p1%2525xy', '/p2%f0', '/p3%2525xy', '/p4%xy', '/p4%25xy',
294+
'/query%3Fq=v', '/question?q=v',
295+
'/emptyquery',
296+
'/dollar', '/asterisk',
297+
]
298+
bad = [
299+
'/a1/Z-._~', '/a1/%5A%2D%2E%5F%7E',
300+
'/a2/Z-._~', '/a2/%5A%2D%2E%5F%7E',
301+
'/u1/%F0%9F%90%8D', '/u1/%f0%9f%90%8d', '/u1/\U0001f40d',
302+
'/u2/%F0%9F%90%8D', '/u2/%f0%9f%90%8d', '/u2/\U0001f40d',
303+
'/u3/%F0%9F%90%8D', '/u3/%f0%9f%90%8d', '/u3/\U0001f40d',
304+
'/v1/%F0', '/v1/%f0', '/v1/\udcf0', '/v1/\U0001f40d',
305+
'/v2/%F0', '/v2/%f0', '/v2/\udcf0', '/v2/\U0001f40d',
306+
'/v3/%F0', '/v3/%f0', '/v3/\udcf0', '/v3/\U0001f40d',
307+
'/p1%xy', '/p1%25xy',
308+
'/p2%', '/p2%25', '/p2%2525', '/p2%xy',
309+
'/p3%xy', '/p3%25xy',
310+
'/p4%2525xy',
311+
'/john%20smith', '/john smith',
312+
'/john%20doe', '/john doe',
313+
'/trailingspace%20', '/trailingspace ',
314+
'/query?q=v', '/question%3Fq=v',
315+
'/query2?q=?', '/query2?q=%3F',
316+
'/query3?q=?', '/query3?q=%3F',
317+
'/emptyquery?', '/emptyquery?q=v',
318+
'/hash#f', '/hash%23f',
319+
'/dollar$', '/dollar%24',
320+
'/asterisk*', '/asterisk%2A',
321+
'/sub/dir', '/sub%2Fdir',
322+
'/slash%2F', '/slash/',
323+
]
324+
# other reserved characters
325+
for c in ":/#[]@!$&'()*+,;=":
326+
robots_txt += f'Disallow: /raw{c}\nDisallow: /pc%{ord(c):02X}\n'
327+
bad.append(f'/raw{c}')
328+
bad.append(f'/raw%{ord(c):02X}')
329+
bad.append(f'/pc{c}')
330+
bad.append(f'/pc%{ord(c):02X}')
261331

262332

263333
class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
@@ -299,26 +369,17 @@ def test_string_formatting(self):
299369
self.assertEqual(str(self.parser), self.expected_output)
300370

301371

302-
class RobotHandler(BaseHTTPRequestHandler):
303-
304-
def do_GET(self):
305-
self.send_error(403, "Forbidden access")
306-
307-
def log_message(self, format, *args):
308-
pass
309-
310-
311372
@unittest.skipUnless(
312373
support.has_socket_support,
313374
"Socket server requires working socket."
314375
)
315-
class PasswordProtectedSiteTestCase(unittest.TestCase):
376+
class BaseLocalNetworkTestCase:
316377

317378
def setUp(self):
318379
# clear _opener global variable
319380
self.addCleanup(urllib.request.urlcleanup)
320381

321-
self.server = HTTPServer((socket_helper.HOST, 0), RobotHandler)
382+
self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)
322383

323384
self.t = threading.Thread(
324385
name='HTTPServer serving',
@@ -335,6 +396,57 @@ def tearDown(self):
335396
self.t.join()
336397
self.server.server_close()
337398

399+
400+
SAMPLE_ROBOTS_TXT = b'''\
401+
User-agent: test_robotparser
402+
Disallow: /utf8/\xf0\x9f\x90\x8d
403+
Disallow: /non-utf8/\xf0
404+
Disallow: //[spam]/path
405+
'''
406+
407+
408+
class LocalNetworkTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
409+
class RobotHandler(BaseHTTPRequestHandler):
410+
411+
def do_GET(self):
412+
self.send_response(200)
413+
self.end_headers()
414+
self.wfile.write(SAMPLE_ROBOTS_TXT)
415+
416+
def log_message(self, format, *args):
417+
pass
418+
419+
@threading_helper.reap_threads
420+
def testRead(self):
421+
# Test that reading a weird robots.txt doesn't fail.
422+
addr = self.server.server_address
423+
url = f'http://{socket_helper.HOST}:{addr[1]}'
424+
robots_url = url + '/robots.txt'
425+
parser = urllib.robotparser.RobotFileParser()
426+
parser.set_url(robots_url)
427+
parser.read()
428+
# And it can even interpret the weird paths in some reasonable way.
429+
agent = 'test_robotparser'
430+
self.assertTrue(parser.can_fetch(agent, robots_url))
431+
self.assertTrue(parser.can_fetch(agent, url + '/utf8/'))
432+
self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
433+
self.assertFalse(parser.can_fetch(agent, url + '/utf8/%F0%9F%90%8D'))
434+
self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
435+
self.assertTrue(parser.can_fetch(agent, url + '/non-utf8/'))
436+
self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/%F0'))
437+
self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/\U0001f40d'))
438+
self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))
439+
440+
441+
class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
442+
class RobotHandler(BaseHTTPRequestHandler):
443+
444+
def do_GET(self):
445+
self.send_error(403, "Forbidden access")
446+
447+
def log_message(self, format, *args):
448+
pass
449+
338450
@threading_helper.reap_threads
339451
def testPasswordProtectedSite(self):
340452
addr = self.server.server_address

Lib/urllib/robotparser.py

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,18 @@
2020
RequestRate = collections.namedtuple("RequestRate", "requests seconds")
2121

2222

23+
def normalize(path):
24+
unquoted = urllib.parse.unquote(path, errors='surrogateescape')
25+
return urllib.parse.quote(unquoted, errors='surrogateescape')
26+
27+
def normalize_path(path):
28+
path, sep, query = path.partition('?')
29+
path = normalize(path)
30+
if sep:
31+
path += '?' + normalize(query)
32+
return path
33+
34+
2335
class RobotFileParser:
2436
""" This class provides a set of methods to read, parse and answer
2537
questions about a single robots.txt file.
@@ -55,7 +67,7 @@ def modified(self):
5567
def set_url(self, url):
5668
"""Sets the URL referring to a robots.txt file."""
5769
self.url = url
58-
self.host, self.path = urllib.parse.urlparse(url)[1:3]
70+
self.host, self.path = urllib.parse.urlsplit(url)[1:3]
5971

6072
def read(self):
6173
"""Reads the robots.txt URL and feeds it to the parser."""
@@ -69,7 +81,7 @@ def read(self):
6981
err.close()
7082
else:
7183
raw = f.read()
72-
self.parse(raw.decode("utf-8").splitlines())
84+
self.parse(raw.decode("utf-8", "surrogateescape").splitlines())
7385

7486
def _add_entry(self, entry):
7587
if "*" in entry.useragents:
@@ -113,7 +125,7 @@ def parse(self, lines):
113125
line = line.split(':', 1)
114126
if len(line) == 2:
115127
line[0] = line[0].strip().lower()
116-
line[1] = urllib.parse.unquote(line[1].strip())
128+
line[1] = line[1].strip()
117129
if line[0] == "user-agent":
118130
if state == 2:
119131
self._add_entry(entry)
@@ -167,10 +179,11 @@ def can_fetch(self, useragent, url):
167179
return False
168180
# search for given user agent matches
169181
# the first match counts
170-
parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
171-
url = urllib.parse.urlunparse(('','',parsed_url.path,
172-
parsed_url.params,parsed_url.query, parsed_url.fragment))
173-
url = urllib.parse.quote(url)
182+
# TODO: The private API is used in order to preserve an empty query.
183+
# This is temporary until the public API starts supporting this feature.
184+
parsed_url = urllib.parse._urlsplit(url, '')
185+
url = urllib.parse._urlunsplit(None, None, *parsed_url[2:])
186+
url = normalize_path(url)
174187
if not url:
175188
url = "/"
176189
for entry in self.entries:
@@ -213,16 +226,14 @@ def __str__(self):
213226
entries = entries + [self.default_entry]
214227
return '\n\n'.join(map(str, entries))
215228

216-
217229
class RuleLine:
218230
"""A rule line is a single "Allow:" (allowance==True) or "Disallow:"
219231
(allowance==False) followed by a path."""
220232
def __init__(self, path, allowance):
221233
if path == '' and not allowance:
222234
# an empty value means allow all
223235
allowance = True
224-
path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
225-
self.path = urllib.parse.quote(path)
236+
self.path = normalize_path(path)
226237
self.allowance = allowance
227238

228239
def applies_to(self, filename):
[third file — filename missing from this page capture; the content below is a NEWS/changelog entry, presumably under Misc/NEWS.d/ — verify against the original commit]

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Fix parsing and normalization of the ``robots.txt`` rules and URLs in the
2+
:mod:`robotparser` module. Distinguish the query separator from
3+
a percent-encoded ``?``. Fix support of non-UTF-8 ``robots.txt`` files.
4+
Don't fail trying to parse weird paths.

0 commit comments

Comments
 (0)