Skip to content

Commit 1c2de8f

Browse files
committed
Add a sample_id column to ancestors
Starts to address #604. Also changes the sampledata format
1 parent a34f65a commit 1c2de8f

File tree

3 files changed

+89
-16
lines changed

3 files changed

+89
-16
lines changed

docs/conf.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,7 @@
194194
intersphinx_mapping = {
195195
"https://docs.python.org/3/": None,
196196
"https://tskit.dev/tutorials": None,
197+
"https://numpy.org/doc/stable/": None,
197198
"https://tskit.dev/tskit/docs/stable": None,
198199
"https://tskit.dev/msprime/docs/stable": None,
199200
"https://numcodecs.readthedocs.io/en/stable/": None,

tests/test_formats.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2140,13 +2140,42 @@ def test_insert_proxy_1_sample(self):
21402140
sample_data, sample_ids=[i]
21412141
)
21422142
assert ancestors.num_ancestors + 1 == ancestors_extra.num_ancestors
2143-
inserted = -1
2143+
inserted = -1 # inserted one should always be the last
21442144
self.assert_ancestor_full_span(ancestors_extra, [inserted])
21452145
assert np.array_equal(
21462146
ancestors_extra.ancestors_haplotype[inserted],
21472147
sample_data.sites_genotypes[:, i][used_sites],
21482148
)
21492149

2150+
def test_insert_proxy_sample_ids(self):
2151+
ids = [1, 4]
2152+
sd, _ = self.get_example_data(10, 10, 40)
2153+
ancestors = tsinfer.generate_ancestors(sd).insert_proxy_samples(
2154+
sd, sample_ids=ids
2155+
)
2156+
inference_sites = np.isin(sd.sites_position[:], ancestors.sites_position[:])
2157+
anc_sample_ids = ancestors.ancestors_sample_id[:]
2158+
assert np.sum(anc_sample_ids != tskit.NULL) == len(ids)
2159+
for sample_id in ids:
2160+
assert np.sum(anc_sample_ids == sample_id) == 1
2161+
anc_sample = np.where(anc_sample_ids == sample_id)[0][0]
2162+
assert ancestors.ancestors_start[anc_sample] == 0
2163+
assert ancestors.ancestors_end[anc_sample] == ancestors.num_sites
2164+
assert len(ancestors.ancestors_focal_sites[anc_sample]) == 0
2165+
2166+
haplotype = next(sd.haplotypes([sample_id], sites=inference_sites))[1]
2167+
assert np.all(ancestors.ancestors_haplotype[anc_sample] == haplotype)
2168+
2169+
def test_insert_proxy_different_sample_data(self):
2170+
ids = [1, 4]
2171+
sd, _ = self.get_example_data(10, 10, 40)
2172+
ancestors = tsinfer.generate_ancestors(sd)
2173+
sd_copy, _ = self.get_example_data(10, 10, num_ancestors=40)
2174+
ancestors_extra = ancestors.insert_proxy_samples(
2175+
sd_copy, sample_ids=ids, require_same_sample_data=False
2176+
)
2177+
assert np.all(ancestors_extra.ancestors_sample_id[:] == tskit.NULL)
2178+
21502179
def test_insert_proxy_sample_provenance(self):
21512180
sample_data, _ = self.get_example_data(10, 10, 40)
21522181
ancestors = tsinfer.generate_ancestors(sample_data)
@@ -2189,6 +2218,8 @@ def test_insert_proxy_time_historical_samples(self):
21892218
assert np.array_equal(
21902219
ancestors_extra.ancestors_time[-1], historical_sample_time + epsilon
21912220
)
2221+
assert np.sum(ancestors_extra.ancestors_sample_id[:] != tskit.NULL) == 1
2222+
assert ancestors_extra.ancestors_sample_id[-1] == 9
21922223

21932224
# Test 2 proxies, one historical, specified in different ways / orders
21942225
s_ids = np.array([9, 0])
@@ -2210,6 +2241,9 @@ def test_insert_proxy_time_historical_samples(self):
22102241
ancestors_extra.ancestors_haplotype[-1], G[:, 0][used_sites]
22112242
)
22122243
assert np.array_equal(ancestors_extra.ancestors_time[-1], epsilon)
2244+
assert np.sum(ancestors_extra.ancestors_sample_id[:] != tskit.NULL) == 2
2245+
assert ancestors_extra.ancestors_sample_id[-2] == 9
2246+
assert ancestors_extra.ancestors_sample_id[-1] == 0
22132247

22142248
def test_insert_proxy_sample_epsilon(self):
22152249
sample_data, _ = self.get_example_data(10, 10, 40)

tsinfer/formats.py

Lines changed: 53 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2028,6 +2028,10 @@ def haplotypes(self, samples=None, sites=None):
20282028
``None``, return haplotypes for all sample nodes, otherwise this may be a
20292029
numpy array (or array-like) object (converted to dtype=np.int32).
20302030
:param array sites: A numpy array of sites to use.
2031+
2032+
2033+
:return: An iterator returning sucessive instances of (sample_id, haplotype).
2034+
:rtype: iter(int, numpy.ndarray(dtype=int8))
20312035
"""
20322036
if samples is None:
20332037
samples = np.arange(self.num_samples)
@@ -2120,6 +2124,7 @@ class Ancestor:
21202124
time = attr.ib()
21212125
focal_sites = attr.ib()
21222126
haplotype = attr.ib()
2127+
sample_id = attr.ib()
21232128

21242129

21252130
class AncestorData(DataContainer):
@@ -2157,7 +2162,7 @@ class AncestorData(DataContainer):
21572162
"""
21582163

21592164
FORMAT_NAME = "tsinfer-ancestor-data"
2160-
FORMAT_VERSION = (3, 0)
2165+
FORMAT_VERSION = (3, 1)
21612166

21622167
def __init__(self, sample_data, **kwargs):
21632168
super().__init__(**kwargs)
@@ -2218,6 +2223,13 @@ def __init__(self, sample_data, **kwargs):
22182223
compressor=self._compressor,
22192224
fill_value=None,
22202225
)
2226+
self.data.create_dataset(
2227+
"ancestors/sample_id",
2228+
shape=(0,),
2229+
chunks=chunks,
2230+
compressor=self._compressor,
2231+
dtype=np.int32,
2232+
)
22212233

22222234
self._alloc_ancestor_writer()
22232235

@@ -2233,6 +2245,7 @@ def _alloc_ancestor_writer(self):
22332245
"time": self.ancestors_time,
22342246
"focal_sites": self.ancestors_focal_sites,
22352247
"haplotype": self.ancestors_haplotype,
2248+
"sample_id": self.ancestors_sample_id,
22362249
},
22372250
num_threads=self._num_flush_threads,
22382251
)
@@ -2254,6 +2267,7 @@ def __str__(self):
22542267
("ancestors/time", zarr_summary(self.ancestors_time)),
22552268
("ancestors/focal_sites", zarr_summary(self.ancestors_focal_sites)),
22562269
("ancestors/haplotype", zarr_summary(self.ancestors_haplotype)),
2270+
("ancestors/sample_id", zarr_summary(self.ancestors_sample_id)),
22572271
]
22582272
return super().__str__() + self._format_str(values)
22592273

@@ -2278,6 +2292,9 @@ def data_equal(self, other):
22782292
self.ancestors_focal_sites[:], other.ancestors_focal_sites[:]
22792293
)
22802294
and np_obj_equal(self.ancestors_haplotype[:], other.ancestors_haplotype[:])
2295+
and np.array_equal(
2296+
self.ancestors_sample_id[:], other.ancestors_sample_id[:]
2297+
)
22812298
)
22822299

22832300
@property
@@ -2320,6 +2337,10 @@ def ancestors_focal_sites(self):
23202337
def ancestors_haplotype(self):
23212338
return self.data["ancestors/haplotype"]
23222339

2340+
@property
2341+
def ancestors_sample_id(self):
2342+
return self.data["ancestors/sample_id"]
2343+
23232344
@property
23242345
def ancestors_length(self):
23252346
"""
@@ -2338,6 +2359,7 @@ def insert_proxy_samples(
23382359
*,
23392360
sample_ids=None,
23402361
epsilon=None,
2362+
map_ancestors=False,
23412363
allow_mutation=False,
23422364
require_same_sample_data=True,
23432365
**kwargs,
@@ -2350,7 +2372,8 @@ def insert_proxy_samples(
23502372
23512373
A *proxy sample ancestor* is an ancestor based upon a known sample. At
23522374
sites used in the full inference process, the haplotype of this ancestor
2353-
is identical to that of the sample on which it is based. The time of the
2375+
is identical to that of the sample on which it is based, and the
2376+
The time of the
23542377
ancestor is taken to be a fraction ``epsilon`` older than the sample on
23552378
which it is based.
23562379
@@ -2364,11 +2387,11 @@ def insert_proxy_samples(
23642387
23652388
.. note::
23662389
2367-
The proxy sample ancestors inserted here will correspond to extra nodes
2368-
in the inferred tree sequence. At sites which are not used in the full
2390+
The proxy sample ancestors inserted here will end up as extra nodes
2391+
in the inferred tree sequence, but at sites which are not used in the full
23692392
inference process (e.g. sites unique to a single historical sample),
2370-
these proxy sample ancestor nodes may have a different genotype from
2371-
their corresponding sample.
2393+
it is possible for these proxy sample ancestor nodes to have a different
2394+
genotype from their corresponding sample.
23722395
23732396
:param SampleData sample_data: The :class:`.SampleData` instance
23742397
from which to select the samples used to create extra ancestors.
@@ -2403,7 +2426,8 @@ def insert_proxy_samples(
24032426
to ensure that the encoding of alleles in ``sample_data`` matches the
24042427
encoding in the current :class:`AncestorData` instance (i.e. that in the
24052428
original :class:`.SampleData` instance on which the current ancestors
2406-
are based).
2429+
are based). Note that in this case, the sample_id is not recorded in the
2430+
returned object.
24072431
:param \\**kwargs: Further arguments passed to the constructor when creating
24082432
the new :class:`AncestorData` instance which will be returned.
24092433
@@ -2501,7 +2525,11 @@ def insert_proxy_samples(
25012525
time=proxy_time,
25022526
focal_sites=[],
25032527
haplotype=haplotype,
2528+
sample_id=sample_id
2529+
if sample_data.uuid == self.sample_data_uuid
2530+
else tskit.NULL,
25042531
)
2532+
25052533
# Add any ancestors remaining in the current instance
25062534
while ancestor is not None:
25072535
other.add_ancestor(**attr.asdict(ancestor, filter=exclude_id))
@@ -2583,7 +2611,6 @@ def truncate_ancestors(
25832611
start = self.ancestors_start[:]
25842612
end = self.ancestors_end[:]
25852613
time = self.ancestors_time[:]
2586-
focal_sites = self.ancestors_focal_sites[:]
25872614
haplotypes = self.ancestors_haplotype[:]
25882615
if upper_time_bound > np.max(time) or lower_time_bound > np.max(time):
25892616
raise ValueError("Time bounds cannot be greater than older ancestor")
@@ -2621,16 +2648,12 @@ def truncate_ancestors(
26212648
)
26222649
start[anc.id] = insert_pos_start
26232650
end[anc.id] = insert_pos_end
2624-
time[anc.id] = anc.time
2625-
focal_sites[anc.id] = anc.focal_sites
26262651
haplotypes[anc.id] = anc.haplotype[
26272652
insert_pos_start - anc.start : insert_pos_end - anc.start
26282653
]
26292654
# TODO - record truncation in ancestors' metadata when supported
26302655
truncated.ancestors_start[:] = start
26312656
truncated.ancestors_end[:] = end
2632-
truncated.ancestors_time[:] = time
2633-
truncated.ancestors_focal_sites[:] = focal_sites
26342657
truncated.ancestors_haplotype[:] = haplotypes
26352658
truncated.record_provenance(command="truncate_ancestors")
26362659
truncated.finalise()
@@ -2651,6 +2674,12 @@ def set_inference_sites(self, site_ids):
26512674
sites in the sample data file, and the IDs must be in increasing order.
26522675
26532676
This must be called before the first call to :meth:`.add_ancestor`.
2677+
2678+
.. note::
2679+
To obtain a list of which sites in a sample data or a tree sequence have
2680+
been placed into the ancestors file for use in inference, you can apply
2681+
:func:`numpy.isin` to the list of positions, e.g.
2682+
``np.isin(sample_data.sites_position[:], ancestors.sites_position[:])``
26542683
"""
26552684
self._check_build_mode()
26562685
position = self.sample_data.sites_position[:][site_ids]
@@ -2659,12 +2688,18 @@ def set_inference_sites(self, site_ids):
26592688
array[:] = position
26602689
self._num_alleles = self.sample_data.num_alleles(site_ids)
26612690

2662-
def add_ancestor(self, start, end, time, focal_sites, haplotype):
2691+
def add_ancestor(
2692+
self, start, end, time, focal_sites, haplotype, sample_id=tskit.NULL
2693+
):
26632694
"""
26642695
Adds an ancestor with the specified haplotype, with ancestral material over the
26652696
interval [start:end], that is associated with the specified timepoint and has new
2666-
mutations at the specified list of focal sites. Ancestors should be added in time
2667-
order, with the oldest first. The id of the added ancestor is returned.
2697+
mutations at the specified list of focal sites. If this ancestor is based on a
2698+
specific sample from the associated sample_data file (i.e. a historical sample)
2699+
then the ``sample_id`` in the sample data file can also be passed as a parameter.
2700+
2701+
The Ancestors should be added in time order, with the oldest first. The id of
2702+
the added ancestor is returned.
26682703
"""
26692704
self._check_build_mode()
26702705
haplotype = tskit.util.safe_np_int_cast(haplotype, dtype=np.int8, copy=True)
@@ -2694,6 +2729,7 @@ def add_ancestor(self, start, end, time, focal_sites, haplotype):
26942729
time=time,
26952730
focal_sites=focal_sites,
26962731
haplotype=haplotype,
2732+
sample_id=sample_id,
26972733
)
26982734

26992735
def finalise(self):
@@ -2715,6 +2751,7 @@ def ancestors(self):
27152751
end = self.ancestors_end[:]
27162752
time = self.ancestors_time[:]
27172753
focal_sites = self.ancestors_focal_sites[:]
2754+
sample_id = self.ancestors_sample_id[:]
27182755
for j, h in enumerate(chunk_iterator(self.ancestors_haplotype)):
27192756
yield Ancestor(
27202757
id=j,
@@ -2723,6 +2760,7 @@ def ancestors(self):
27232760
time=time[j],
27242761
focal_sites=focal_sites[j],
27252762
haplotype=h,
2763+
sample_id=sample_id[j],
27262764
)
27272765

27282766

0 commit comments

Comments
 (0)