Skip to content

gh-117779: Fix reading duplicated entries in zipfile by name #129254

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 106 additions & 2 deletions Lib/test/test_zipfile/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -2415,7 +2415,36 @@ def test_decompress_without_3rd_party_library(self):
self.assertRaises(RuntimeError, zf.extract, 'a.txt')

@requires_zlib()
def test_full_overlap(self):
def test_full_overlap_different_names(self):
data = (
b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xa0lH\x05\xe2\x1e'
b'8\xbb\x10\x00\x00\x00\t\x04\x00\x00\x01\x00\x00\x00b\xed'
b'\xc0\x81\x08\x00\x00\x00\xc00\xd6\xfbK\\d\x0b`P'
b'K\x01\x02\x14\x00\x14\x00\x00\x00\x08\x00\xa0lH\x05\xe2'
b'\x1e8\xbb\x10\x00\x00\x00\t\x04\x00\x00\x01\x00\x00\x00\x00'
b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00aPK'
b'\x01\x02\x14\x00\x14\x00\x00\x00\x08\x00\xa0lH\x05\xe2\x1e'
b'8\xbb\x10\x00\x00\x00\t\x04\x00\x00\x01\x00\x00\x00\x00\x00'
b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00bPK\x05'
b'\x06\x00\x00\x00\x00\x02\x00\x02\x00^\x00\x00\x00/\x00\x00'
b'\x00\x00\x00'
Comment on lines +2419 to +2430
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally, binary data like this should be accompanied by an explanation of what it represents and, where possible, how it was constructed. Should this test start failing, how will someone know what this zip file represents?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. I'll add comments. Please see #137152.

The data was created manually. I created simple ZIP files using the zip command and then edited them by hand: removed or moved the data, changed file names, and updated offsets. I then checked how the unpatched zipfile handled the results (everything was read successfully).

Only the data for test_quoted_overlap was generated by a script provided in the original report, and then simplified.

)
with zipfile.ZipFile(io.BytesIO(data), 'r') as zipf:
self.assertEqual(zipf.namelist(), ['a', 'b'])
zi = zipf.getinfo('a')
self.assertEqual(zi.header_offset, 0)
self.assertEqual(zi.compress_size, 16)
self.assertEqual(zi.file_size, 1033)
zi = zipf.getinfo('b')
self.assertEqual(zi.header_offset, 0)
self.assertEqual(zi.compress_size, 16)
self.assertEqual(zi.file_size, 1033)
self.assertEqual(len(zipf.read('b')), 1033)
with self.assertRaisesRegex(zipfile.BadZipFile, 'File name.*differ'):
zipf.read('a')

@requires_zlib()
def test_full_overlap_different_names2(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would have given this function a different name, something that distinguishes it from the non-2 version... or at the very least add a docstring explaining what it's asserting. Future readers are going to have a hard time discerning the motivations for these two tests based on null vs. 2.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These tests are very similar; the only difference is which central directory entry's name matches the local header name -- the first or the second. I hope it will be clear from the comments.

data = (
b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xa0lH\x05\xe2\x1e'
b'8\xbb\x10\x00\x00\x00\t\x04\x00\x00\x01\x00\x00\x00a\xed'
Expand All @@ -2439,9 +2468,43 @@ def test_full_overlap(self):
self.assertEqual(zi.header_offset, 0)
self.assertEqual(zi.compress_size, 16)
self.assertEqual(zi.file_size, 1033)
self.assertEqual(len(zipf.read('a')), 1033)
with self.assertRaisesRegex(zipfile.BadZipFile, 'File name.*differ'):
zipf.read('b')
with self.assertWarnsRegex(UserWarning, 'Overlapped entries') as cm:
self.assertEqual(len(zipf.read('a')), 1033)
self.assertEqual(cm.filename, __file__)

@requires_zlib()
def test_full_overlap_same_name(self):
    # Archive layout (byte offsets):
    #     0: local file header for 'a' followed by 16 bytes of
    #        compressed data (1033 bytes once decompressed)
    #    47: two central directory entries, both named 'a' and both
    #        pointing at the local header at offset 0
    #   141: end of central directory record (2 entries, no comment)
    payload = (
        b'PK\x03\x04\x14\x00\x00\x00\x08\x00\xa0lH\x05\xe2\x1e'
        b'8\xbb\x10\x00\x00\x00\t\x04\x00\x00\x01\x00\x00\x00a\xed'
        b'\xc0\x81\x08\x00\x00\x00\xc00\xd6\xfbK\\d\x0b`P'
        b'K\x01\x02\x14\x00\x14\x00\x00\x00\x08\x00\xa0lH\x05\xe2'
        b'\x1e8\xbb\x10\x00\x00\x00\t\x04\x00\x00\x01\x00\x00\x00\x00'
        b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00aPK'
        b'\x01\x02\x14\x00\x14\x00\x00\x00\x08\x00\xa0lH\x05\xe2\x1e'
        b'8\xbb\x10\x00\x00\x00\t\x04\x00\x00\x01\x00\x00\x00\x00\x00'
        b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00aPK\x05'
        b'\x06\x00\x00\x00\x00\x02\x00\x02\x00^\x00\x00\x00/\x00\x00'
        b'\x00\x00\x00'
    )
    with zipfile.ZipFile(io.BytesIO(payload), 'r') as archive:
        self.assertEqual(archive.namelist(), ['a', 'a'])
        entries = archive.infolist()
        self.assertEqual(len(entries), 2)

        info = archive.getinfo('a')
        expected_fields = (
            ('header_offset', 0),
            ('compress_size', 16),
            ('file_size', 1033),
        )
        for field, expected in expected_fields:
            self.assertEqual(getattr(info, field), expected)

        # Reading by name, through the ZipInfo returned by getinfo(),
        # and through the last listed entry is expected to succeed.
        for member in ('a', info, entries[1]):
            self.assertEqual(len(archive.read(member)), 1033)

        # The first listed entry must stay readable, but only with an
        # 'Overlapped entries' warning attributed to this file.
        with self.assertWarnsRegex(UserWarning, 'Overlapped entries') as cm:
            content = archive.read(entries[0])
            self.assertEqual(len(content), 1033)
        self.assertEqual(cm.filename, __file__)

        # Merely opening that entry has to emit the same warning.
        with self.assertWarnsRegex(UserWarning, 'Overlapped entries') as cm:
            stream = archive.open(entries[0])
            stream.close()
        self.assertEqual(cm.filename, __file__)

@requires_zlib()
def test_quoted_overlap(self):
Expand Down Expand Up @@ -2474,6 +2537,47 @@ def test_quoted_overlap(self):
zipf.read('a')
self.assertEqual(len(zipf.read('b')), 1033)

@requires_zlib()
def test_overlap_with_central_dir(self):
    # The archive contains no local file header at all:
    #    0: central directory entry for 'a' whose local header offset
    #       is 0, i.e. it points back at the central directory itself
    #   47: end of central directory record (1 entry, no comment)
    payload = (
        b'PK\x01\x02\x14\x03\x14\x00\x00\x00\x08\x00G_|Z'
        b'\xe2\x1e8\xbb\x0b\x00\x00\x00\t\x04\x00\x00\x01\x00\x00\x00'
        b'\x00\x00\x00\x00\x00\x00\x00\x00\xb4\x81\x00\x00\x00\x00aP'
        b'K\x05\x06\x00\x00\x00\x00\x01\x00\x01\x00/\x00\x00\x00\x00'
        b'\x00\x00\x00\x00\x00'
    )
    with zipfile.ZipFile(io.BytesIO(payload), 'r') as archive:
        self.assertEqual(archive.namelist(), ['a'])
        self.assertEqual(len(archive.infolist()), 1)

        info = archive.getinfo('a')
        expected_fields = (
            ('header_offset', 0),
            ('compress_size', 11),
            ('file_size', 1033),
        )
        for field, expected in expected_fields:
            self.assertEqual(getattr(info, field), expected)

        # Offset 0 holds a central directory signature, not a local
        # file header, so the read has to be rejected.
        self.assertRaisesRegex(
            zipfile.BadZipFile, 'Bad magic number', archive.read, 'a')

@requires_zlib()
def test_overlap_with_archive_comment(self):
    # Archive layout (byte offsets):
    #    0: central directory entry for 'a' with local header
    #       offset 69
    #   47: end of central directory record declaring a 42-byte
    #       archive comment
    #   69: the archive comment, which holds a local file header for
    #       'a' plus 11 bytes of compressed data
    payload = (
        b'PK\x01\x02\x14\x03\x14\x00\x00\x00\x08\x00G_|Z'
        b'\xe2\x1e8\xbb\x0b\x00\x00\x00\t\x04\x00\x00\x01\x00\x00\x00'
        b'\x00\x00\x00\x00\x00\x00\x00\x00\xb4\x81E\x00\x00\x00aP'
        b'K\x05\x06\x00\x00\x00\x00\x01\x00\x01\x00/\x00\x00\x00\x00'
        b'\x00\x00\x00*\x00'
        b'PK\x03\x04\x14\x00\x00\x00\x08\x00G_|Z\xe2\x1e'
        b'8\xbb\x0b\x00\x00\x00\t\x04\x00\x00\x01\x00\x00\x00aK'
        b'L\x1c\x05\xa3`\x14\x8cx\x00\x00'
    )
    with zipfile.ZipFile(io.BytesIO(payload), 'r') as archive:
        self.assertEqual(archive.namelist(), ['a'])
        self.assertEqual(len(archive.infolist()), 1)

        info = archive.getinfo('a')
        expected_fields = (
            ('header_offset', 69),
            ('compress_size', 11),
            ('file_size', 1033),
        )
        for field, expected in expected_fields:
            self.assertEqual(getattr(info, field), expected)

        # The member data sits after the start of the central
        # directory, so reading it has to fail as an overlap.
        self.assertRaisesRegex(
            zipfile.BadZipFile, 'Overlapped entries', archive.read, 'a')

def tearDown(self):
unlink(TESTFN)
unlink(TESTFN2)
Expand Down
16 changes: 12 additions & 4 deletions Lib/zipfile/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1545,9 +1545,8 @@ def _RealGetContents(self):
print("total", total)

end_offset = self.start_dir
for zinfo in sorted(self.filelist,
key=lambda zinfo: zinfo.header_offset,
reverse=True):
for zinfo in reversed(sorted(self.filelist,
key=lambda zinfo: zinfo.header_offset)):
zinfo._end_offset = end_offset
end_offset = zinfo.header_offset

Expand Down Expand Up @@ -1709,7 +1708,16 @@ def open(self, name, mode="r", pwd=None, *, force_zip64=False):

if (zinfo._end_offset is not None and
zef_file.tell() + zinfo.compress_size > zinfo._end_offset):
raise BadZipFile(f"Overlapped entries: {zinfo.orig_filename!r} (possible zip bomb)")
if zinfo._end_offset == zinfo.header_offset:
import warnings
warnings.warn(
f"Overlapped entries: {zinfo.orig_filename!r} "
f"(possible zip bomb)",
skip_file_prefixes=(os.path.dirname(__file__),))
else:
raise BadZipFile(
f"Overlapped entries: {zinfo.orig_filename!r} "
f"(possible zip bomb)")

# check for encrypted flag & handle password
is_encrypted = zinfo.flag_bits & _MASK_ENCRYPTED
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Fix reading duplicated entries in :mod:`zipfile` by name.
Reading duplicated entries (except the last one) by ``ZipInfo``
now emits a warning instead of raising an exception.
Loading