|
23 | 23 | Compression utilities for tskit tree sequences.
|
24 | 24 | """
|
25 | 25 | import contextlib
|
| 26 | +import functools |
26 | 27 | import json
|
27 | 28 | import logging
|
28 |
| -import os.path |
| 29 | +import os |
| 30 | +import pathlib |
29 | 31 | import tempfile
|
30 | 32 | import warnings
|
31 | 33 | import zipfile
|
@@ -71,32 +73,43 @@ def minimal_dtype(array):
|
71 | 73 | return dtype
|
72 | 74 |
|
73 | 75 |
|
74 |
def compress(ts, destination, variants_only=False):
    """
    Compresses the specified tree sequence and writes it to the specified path
    or file-like object. By default, fully lossless compression is used so that
    tree sequences are identical before and after compression. By specifying
    the ``variants_only`` option, a lossy compression can be used, which
    discards any information that is not needed to represent the variants
    (which are stored losslessly).

    :param tskit.TreeSequence ts: The input tree sequence.
    :param destination: The string, :class:`pathlib.Path` or file-like object
        we should write the compressed file to. A file-like object is assumed
        to be open for binary writing.
    :param bool variants_only: If True, discard all information not necessary
        to represent the variants in the input file.
    """
    try:
        # Only the Path construction is expected to raise TypeError (for
        # objects that are not path-like), so keep the try body minimal.
        destination = pathlib.Path(destination)
    except TypeError:
        # Not path-like: treat destination as a file-like object and build
        # the archive in the default temporary location.
        is_path = False
        destdir = None
    else:
        destination = destination.resolve()
        is_path = True
        # Write the file into a temporary directory on the same file system
        # so that we can write the output atomically.
        destdir = destination.parent
    with tempfile.TemporaryDirectory(dir=destdir, prefix=".tszip_work_") as tmpdir:
        filename = pathlib.Path(tmpdir, "tmp.trees.tgz")
        logging.debug(f"Writing to temporary file {filename}")
        with zarr.ZipStore(filename, mode="w") as store:
            root = zarr.group(store=store)
            compress_zarr(ts, root, variants_only=variants_only)
        if is_path:
            os.replace(filename, destination)
            logging.info(f"Wrote {destination}")
        else:
            # Assume that destination is a file-like object open in "wb" mode.
            # Stream the finished archive across in fixed-size chunks so the
            # whole file never has to be held in memory.
            with open(filename, "rb") as source:
                chunk_size = 2 ** 20  # 1MiB
                for chunk in iter(functools.partial(source.read, chunk_size), b""):
                    destination.write(chunk)
100 | 113 |
|
101 | 114 |
|
102 | 115 | def decompress(path):
|
@@ -275,7 +288,7 @@ def load_zarr(path):
|
275 | 288 | try:
|
276 | 289 | store = zarr.ZipStore(path, mode="r")
|
277 | 290 | except zipfile.BadZipFile as bzf:
|
278 |
| - raise exceptions.FileFormatError("File is not in tgzip format") from bzf |
| 291 | + raise exceptions.FileFormatError("File is not in tszip format") from bzf |
279 | 292 | root = zarr.group(store=store)
|
280 | 293 | try:
|
281 | 294 | check_format(root)
|
|
0 commit comments