Skip to content

Commit 9c026fa

Browse files
gh-111788: Fix parsing and normalization of rules and URLs in robotparser
* Distinguish the query separator from a percent-encoded `?`.
* Fix support of non-UTF-8 robots.txt files.
* Don't fail trying to parse weird paths.
1 parent 06f8d7a commit 9c026fa

File tree

3 files changed

+155
-28
lines changed

3 files changed

+155
-28
lines changed

Lib/test/test_robotparser.py

Lines changed: 130 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,14 @@ class BaseRobotTest:
1616
bad = []
1717
site_maps = None
1818

19+
def __init_subclass__(cls):
20+
super().__init_subclass__()
21+
# Remove tests that do nothing.
22+
if not cls.good:
23+
cls.test_good_urls = None
24+
if not cls.bad:
25+
cls.test_bad_urls = None
26+
1927
def setUp(self):
2028
lines = io.StringIO(self.robots_txt).readlines()
2129
self.parser = urllib.robotparser.RobotFileParser()
@@ -249,15 +257,77 @@ class UseFirstUserAgentWildcardTest(BaseRobotTest, unittest.TestCase):
249257
bad = ['/some/path']
250258

251259

252-
class EmptyQueryStringTest(BaseRobotTest, unittest.TestCase):
253-
# normalize the URL first (#17403)
260+
class PercentEncodingTest(BaseRobotTest, unittest.TestCase):
254261
robots_txt = """\
255262
User-agent: *
256-
Allow: /some/path?
257-
Disallow: /another/path?
258-
"""
259-
good = ['/some/path?']
260-
bad = ['/another/path?']
263+
Disallow: /a1/Z-._~ # unreserved characters
264+
Disallow: /a2/%5A%2D%2E%5F%7E # percent-encoded unreserved characters
265+
Disallow: /u1/%F0%9F%90%8D # percent-encoded ASCII Unicode character
266+
Disallow: /u2/%f0%9f%90%8d
267+
Disallow: /u3/\U0001f40d # raw non-ASCII Unicode character
268+
Disallow: /v1/%F0 # percent-encoded non-ASCII octet
269+
Disallow: /v2/%f0
270+
Disallow: /v3/\udcf0 # raw non-ASCII octet
271+
Disallow: /p1%xy # raw percent
272+
Disallow: /p2%
273+
Disallow: /p3%25xy # percent-encoded percent
274+
Disallow: /p4%2525xy # double percent-encoded percent
275+
Disallow: /john%20smith # space
276+
Disallow: /john doe
277+
Disallow: /trailingspace%20
278+
Disallow: /query?q=v # query
279+
Disallow: /query2?q=%3F
280+
Disallow: /query3?q=?
281+
Disallow: /emptyquery?
282+
Disallow: /question%3Fq=v # not query
283+
Disallow: /hash%23f # not fragment
284+
Disallow: /dollar%24
285+
Disallow: /asterisk%2A
286+
Disallow: /sub/dir
287+
Disallow: /slash%2F
288+
"""
289+
good = [
290+
'/u1/%F0', '/u1/%f0',
291+
'/u2/%F0', '/u2/%f0',
292+
'/u3/%F0', '/u3/%f0',
293+
'/p1%2525xy', '/p2%f0', '/p3%2525xy', '/p4%xy', '/p4%25xy',
294+
'/query%3Fq=v', '/question?q=v',
295+
'/emptyquery',
296+
'/dollar', '/asterisk',
297+
]
298+
bad = [
299+
'/a1/Z-._~', '/a1/%5A%2D%2E%5F%7E',
300+
'/a2/Z-._~', '/a2/%5A%2D%2E%5F%7E',
301+
'/u1/%F0%9F%90%8D', '/u1/%f0%9f%90%8d', '/u1/\U0001f40d',
302+
'/u2/%F0%9F%90%8D', '/u2/%f0%9f%90%8d', '/u2/\U0001f40d',
303+
'/u3/%F0%9F%90%8D', '/u3/%f0%9f%90%8d', '/u3/\U0001f40d',
304+
'/v1/%F0', '/v1/%f0', '/v1/\udcf0', '/v1/\U0001f40d',
305+
'/v2/%F0', '/v2/%f0', '/v2/\udcf0', '/v2/\U0001f40d',
306+
'/v3/%F0', '/v3/%f0', '/v3/\udcf0', '/v3/\U0001f40d',
307+
'/p1%xy', '/p1%25xy',
308+
'/p2%', '/p2%25', '/p2%2525', '/p2%xy',
309+
'/p3%xy', '/p3%25xy',
310+
'/p4%2525xy',
311+
'/john%20smith', '/john smith',
312+
'/john%20doe', '/john doe',
313+
'/trailingspace%20', '/trailingspace ',
314+
'/query?q=v', '/question%3Fq=v',
315+
'/query2?q=?', '/query2?q=%3F',
316+
'/query3?q=?', '/query3?q=%3F',
317+
'/emptyquery?', '/emptyquery?q=v',
318+
'/hash#f', '/hash%23f',
319+
'/dollar$', '/dollar%24',
320+
'/asterisk*', '/asterisk%2A',
321+
'/sub/dir', '/sub%2Fdir',
322+
'/slash%2F', '/slash/',
323+
]
324+
# other reserved characters
325+
for c in ":/#[]@!$&'()*+,;=":
326+
robots_txt += f'Disallow: /raw{c}\nDisallow: /pc%{ord(c):02X}\n'
327+
bad.append(f'/raw{c}')
328+
bad.append(f'/raw%{ord(c):02X}')
329+
bad.append(f'/pc{c}')
330+
bad.append(f'/pc%{ord(c):02X}')
261331

262332

263333
class DefaultEntryTest(BaseRequestRateTest, unittest.TestCase):
@@ -299,26 +369,17 @@ def test_string_formatting(self):
299369
self.assertEqual(str(self.parser), self.expected_output)
300370

301371

302-
class RobotHandler(BaseHTTPRequestHandler):
303-
304-
def do_GET(self):
305-
self.send_error(403, "Forbidden access")
306-
307-
def log_message(self, format, *args):
308-
pass
309-
310-
311372
@unittest.skipUnless(
312373
support.has_socket_support,
313374
"Socket server requires working socket."
314375
)
315-
class PasswordProtectedSiteTestCase(unittest.TestCase):
376+
class BaseLocalNetworkTestCase:
316377

317378
def setUp(self):
318379
# clear _opener global variable
319380
self.addCleanup(urllib.request.urlcleanup)
320381

321-
self.server = HTTPServer((socket_helper.HOST, 0), RobotHandler)
382+
self.server = HTTPServer((socket_helper.HOST, 0), self.RobotHandler)
322383

323384
self.t = threading.Thread(
324385
name='HTTPServer serving',
@@ -335,6 +396,57 @@ def tearDown(self):
335396
self.t.join()
336397
self.server.server_close()
337398

399+
400+
SAMPLE_ROBOTS_TXT = b'''\
401+
User-agent: test_robotparser
402+
Disallow: /utf8/\xf0\x9f\x90\x8d
403+
Disallow: /non-utf8/\xf0
404+
Disallow: //[spam]/path
405+
'''
406+
407+
408+
class LocalNetworkTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
409+
class RobotHandler(BaseHTTPRequestHandler):
410+
411+
def do_GET(self):
412+
self.send_response(200)
413+
self.end_headers()
414+
self.wfile.write(SAMPLE_ROBOTS_TXT)
415+
416+
def log_message(self, format, *args):
417+
pass
418+
419+
@threading_helper.reap_threads
420+
def testRead(self):
421+
# Test that reading a weird robots.txt doesn't fail.
422+
addr = self.server.server_address
423+
url = f'http://{socket_helper.HOST}:{addr[1]}'
424+
robots_url = url + '/robots.txt'
425+
parser = urllib.robotparser.RobotFileParser()
426+
parser.set_url(robots_url)
427+
parser.read()
428+
# And it can even interpret the weird paths in some reasonable way.
429+
agent = 'test_robotparser'
430+
self.assertTrue(parser.can_fetch(agent, robots_url))
431+
self.assertTrue(parser.can_fetch(agent, url + '/utf8/'))
432+
self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
433+
self.assertFalse(parser.can_fetch(agent, url + '/utf8/%F0%9F%90%8D'))
434+
self.assertFalse(parser.can_fetch(agent, url + '/utf8/\U0001f40d'))
435+
self.assertTrue(parser.can_fetch(agent, url + '/non-utf8/'))
436+
self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/%F0'))
437+
self.assertFalse(parser.can_fetch(agent, url + '/non-utf8/\U0001f40d'))
438+
self.assertFalse(parser.can_fetch(agent, url + '/%2F[spam]/path'))
439+
440+
441+
class PasswordProtectedSiteTestCase(BaseLocalNetworkTestCase, unittest.TestCase):
442+
class RobotHandler(BaseHTTPRequestHandler):
443+
444+
def do_GET(self):
445+
self.send_error(403, "Forbidden access")
446+
447+
def log_message(self, format, *args):
448+
pass
449+
338450
@threading_helper.reap_threads
339451
def testPasswordProtectedSite(self):
340452
addr = self.server.server_address

Lib/urllib/robotparser.py

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,18 @@
2020
RequestRate = collections.namedtuple("RequestRate", "requests seconds")
2121

2222

23+
def normalize(path):
24+
unquoted = urllib.parse.unquote(path, errors='surrogateescape')
25+
return urllib.parse.quote(unquoted, errors='surrogateescape')
26+
27+
def normalize_path(path):
28+
path, sep, query = path.partition('?')
29+
path = normalize(path)
30+
if sep:
31+
path += '?' + normalize(query)
32+
return path
33+
34+
2335
class RobotFileParser:
2436
""" This class provides a set of methods to read, parse and answer
2537
questions about a single robots.txt file.
@@ -55,7 +67,7 @@ def modified(self):
5567
def set_url(self, url):
5668
"""Sets the URL referring to a robots.txt file."""
5769
self.url = url
58-
self.host, self.path = urllib.parse.urlparse(url)[1:3]
70+
self.host, self.path = urllib.parse.urlsplit(url)[1:3]
5971

6072
def read(self):
6173
"""Reads the robots.txt URL and feeds it to the parser."""
@@ -69,7 +81,7 @@ def read(self):
6981
err.close()
7082
else:
7183
raw = f.read()
72-
self.parse(raw.decode("utf-8").splitlines())
84+
self.parse(raw.decode("utf-8", "surrogateescape").splitlines())
7385

7486
def _add_entry(self, entry):
7587
if "*" in entry.useragents:
@@ -113,7 +125,7 @@ def parse(self, lines):
113125
line = line.split(':', 1)
114126
if len(line) == 2:
115127
line[0] = line[0].strip().lower()
116-
line[1] = urllib.parse.unquote(line[1].strip())
128+
line[1] = line[1].strip()
117129
if line[0] == "user-agent":
118130
if state == 2:
119131
self._add_entry(entry)
@@ -167,10 +179,11 @@ def can_fetch(self, useragent, url):
167179
return False
168180
# search for given user agent matches
169181
# the first match counts
170-
parsed_url = urllib.parse.urlparse(urllib.parse.unquote(url))
171-
url = urllib.parse.urlunparse(('','',parsed_url.path,
172-
parsed_url.params,parsed_url.query, parsed_url.fragment))
173-
url = urllib.parse.quote(url)
182+
# TODO: The private API is used in order to preserve an empty query.
183+
# This is temporary until the public API starts supporting this feature.
184+
parsed_url = urllib.parse._urlsplit(url, '')
185+
url = urllib.parse._urlunsplit(None, None, *parsed_url[2:])
186+
url = normalize_path(url)
174187
if not url:
175188
url = "/"
176189
for entry in self.entries:
@@ -213,16 +226,14 @@ def __str__(self):
213226
entries = entries + [self.default_entry]
214227
return '\n\n'.join(map(str, entries))
215228

216-
217229
class RuleLine:
218230
"""A rule line is a single "Allow:" (allowance==True) or "Disallow:"
219231
(allowance==False) followed by a path."""
220232
def __init__(self, path, allowance):
221233
if path == '' and not allowance:
222234
# an empty value means allow all
223235
allowance = True
224-
path = urllib.parse.urlunparse(urllib.parse.urlparse(path))
225-
self.path = urllib.parse.quote(path)
236+
self.path = normalize_path(path)
226237
self.allowance = allowance
227238

228239
def applies_to(self, filename):
[third file — filename missing from this page capture; the content below is a NEWS/changelog entry, presumably under Misc/NEWS.d/ — verify against the original commit]

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Fix parsing and normalization of the ``robots.txt`` rules and URLs in the
2+
:mod:`robotparser` module. Distinguish the query separator from
3+
a percent-encoded ``?``. Fix support of non-UTF-8 ``robots.txt`` files.
4+
Don't fail trying to parse weird paths.

0 commit comments

Comments
 (0)