Skip to content

Commit 73fe8a0

Browse files
aabiddanda and mergify[bot]
authored and committed
parent 340315a
author Arjun Biddanda <[email protected]> 1636730041 -0500 committer Arjun Biddanda <[email protected]> 1645130969 -0500 parent 340315a author Arjun Biddanda <[email protected]> 1636730041 -0500 committer Arjun Biddanda <[email protected]> 1645130930 -0500 feature: tszip compression to stdout
1 parent 340315a commit 73fe8a0

File tree

3 files changed

+67
-32
lines changed

3 files changed

+67
-32
lines changed

tests/test_cli.py

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,17 @@
3636
import tszip.cli as cli
3737

3838

39+
def get_stdout_for_pytest():
40+
"""
41+
Pytest automatically wraps stdout to intercept output, but the object
42+
that it uses isn't fully compatible with the production implementation.
43+
Specifically, it doesn't provide a "buffer" attribute, which we
44+
need when writing binary data to it. This is a workaround to make
45+
our tests work.
46+
"""
47+
return sys.stdout
48+
49+
3950
class TestException(Exception):
4051
"""
4152
Custom exception we can throw for testing.
@@ -284,15 +295,22 @@ def test_bad_file_format(self):
284295
f" command."
285296
)
286297

287-
def test_compress_stdout(self):
298+
def test_compress_stdout_keep(self):
288299
self.assertTrue(self.trees_path.exists())
289-
with mock.patch("tszip.cli.exit", side_effect=TestException) as mocked_exit:
290-
with self.assertRaises(TestException):
291-
self.run_tszip([str(self.trees_path)] + ["-c"])
292-
mocked_exit.assert_called_once_with(
293-
"Compressing to stdout not currently supported;"
294-
"Please see https://github.com/tskit-dev/tszip/issues/49"
295-
)
300+
with mock.patch("tszip.cli.get_stdout", wraps=get_stdout_for_pytest):
301+
self.run_tszip_stdout([str(self.trees_path)] + ["-c"])
302+
self.assertTrue(self.trees_path.exists())
303+
304+
def test_compress_stdout_correct(self):
305+
self.assertTrue(self.trees_path.exists())
306+
tmp_file = pathlib.Path(self.tmpdir.name) / "stdout.trees"
307+
with mock.patch("tszip.cli.get_stdout", wraps=get_stdout_for_pytest):
308+
stdout, stderr = self.run_tszip_stdout(["-c", str(self.trees_path)])
309+
with open(tmp_file, "wb+") as tmp:
310+
tmp.write(stdout)
311+
self.assertTrue(tmp_file.exists())
312+
ts = tszip.decompress(str(tmp_file))
313+
self.assertEqual(ts.tables, self.ts.tables)
296314

297315

298316
class DecompressSemanticsMixin:
@@ -416,7 +434,7 @@ def test_bad_file_format(self):
416434
with self.assertRaises(TestException):
417435
self.run_decompress([str(self.compressed_path)])
418436
mocked_exit.assert_called_once_with(
419-
"Error reading '{}': File is not in tgzip format".format(
437+
"Error reading '{}': File is not in tszip format".format(
420438
self.compressed_path
421439
)
422440
)
@@ -485,7 +503,7 @@ def test_bad_file_format(self):
485503
with self.assertRaises(TestException):
486504
cli.tszip_main([str(self.compressed_path), "-l"])
487505
mocked_exit.assert_called_once_with(
488-
"Error reading '{}': File is not in tgzip format".format(
506+
"Error reading '{}': File is not in tszip format".format(
489507
self.compressed_path
490508
)
491509
)

tszip/cli.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -98,16 +98,18 @@ def remove_input(infile, args):
9898

9999
def check_output(outfile, args):
100100
if outfile.exists():
101-
if not args.force:
101+
if not (args.force or args.stdout):
102102
exit(f"'{outfile}' already exists; use --force to overwrite")
103103

104104

105+
# Allows us to easily patch when running tests.
106+
def get_stdout():
107+
return sys.stdout.buffer
108+
109+
105110
def run_compress(args):
106111
if args.stdout:
107-
exit(
108-
"Compressing to stdout not currently supported;"
109-
"Please see https://github.com/tskit-dev/tszip/issues/49"
110-
)
112+
args.keep = True
111113
setup_logging(args)
112114
for file_arg in args.files:
113115
logger.info(f"Compressing {file_arg}")
@@ -119,6 +121,8 @@ def run_compress(args):
119121
infile = pathlib.Path(file_arg)
120122
outfile = pathlib.Path(file_arg + args.suffix)
121123
check_output(outfile, args)
124+
if args.stdout:
125+
outfile = get_stdout()
122126
tszip.compress(ts, outfile, variants_only=args.variants_only)
123127
remove_input(infile, args)
124128

tszip/compression.py

Lines changed: 30 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,11 @@
2323
Compression utilities for tskit tree sequences.
2424
"""
2525
import contextlib
26+
import functools
2627
import json
2728
import logging
28-
import os.path
29+
import os
30+
import pathlib
2931
import tempfile
3032
import warnings
3133
import zipfile
@@ -71,32 +73,43 @@ def minimal_dtype(array):
7173
return dtype
7274

7375

74-
def compress(ts, path, variants_only=False):
76+
def compress(ts, destination, variants_only=False):
7577
"""
76-
Compresses the specified tree sequence and writes it to the specified path.
77-
By default, fully lossless compression is used so that tree sequences are
78-
identical before and after compression. By specifying the ``variants_only``
79-
option, a lossy compression can be used, which discards any information
80-
that is not needed to represent the variants (which are stored losslessly).
78+
Compresses the specified tree sequence and writes it to the specified path
79+
or file-like object. By default, fully lossless compression is used so that
80+
tree sequences are identical before and after compression. By specifying
81+
the ``variants_only`` option, a lossy compression can be used, which
82+
discards any information that is not needed to represent the variants
83+
(which are stored losslessly).
8184
8285
:param tskit.TreeSequence ts: The input tree sequence.
83-
:param str destination: The string or :class:`pathlib.Path` instance describing
84-
the location of the compressed file.
86+
:param str destination: The string, :class:`pathlib.Path` or file-like object
87+
we should write the compressed file to.
8588
:param bool variants_only: If True, discard all information not necessary
8689
to represent the variants in the input file.
8790
"""
88-
destination = str(path)
89-
# Write the file into a temporary directory on the same file system so that
90-
# we can write the output atomically.
91-
destdir = os.path.dirname(os.path.abspath(destination))
91+
try:
92+
destination = pathlib.Path(destination).resolve()
93+
is_path = True
94+
destdir = destination.parent
95+
except TypeError:
96+
is_path = False
97+
destdir = None
9298
with tempfile.TemporaryDirectory(dir=destdir, prefix=".tszip_work_") as tmpdir:
93-
filename = os.path.join(tmpdir, "tmp.trees.tgz")
99+
filename = pathlib.Path(tmpdir, "tmp.trees.tgz")
94100
logging.debug(f"Writing to temporary file {filename}")
95101
with zarr.ZipStore(filename, mode="w") as store:
96102
root = zarr.group(store=store)
97103
compress_zarr(ts, root, variants_only=variants_only)
98-
os.replace(filename, destination)
99-
logging.info(f"Wrote {destination}")
104+
if is_path:
105+
os.replace(filename, destination)
106+
logging.info(f"Wrote {destination}")
107+
else:
108+
# Assume that destination is a file-like object open in "wb" mode.
109+
with open(filename, "rb") as source:
110+
chunk_size = 2 ** 10 # 1 KiB
111+
for chunk in iter(functools.partial(source.read, chunk_size), b""):
112+
destination.write(chunk)
100113

101114

102115
def decompress(path):
@@ -275,7 +288,7 @@ def load_zarr(path):
275288
try:
276289
store = zarr.ZipStore(path, mode="r")
277290
except zipfile.BadZipFile as bzf:
278-
raise exceptions.FileFormatError("File is not in tgzip format") from bzf
291+
raise exceptions.FileFormatError("File is not in tszip format") from bzf
279292
root = zarr.group(store=store)
280293
try:
281294
check_format(root)

0 commit comments

Comments
 (0)