@@ -2028,6 +2028,10 @@ def haplotypes(self, samples=None, sites=None):
2028
2028
``None``, return haplotypes for all sample nodes, otherwise this may be a
2029
2029
numpy array (or array-like) object (converted to dtype=np.int32).
2030
2030
:param array sites: A numpy array of sites to use.
2031
+
2032
+
2033
+ :return: An iterator returning successive instances of (sample_id, haplotype).
2034
+ :rtype: iter(int, numpy.ndarray(dtype=int8))
2031
2035
"""
2032
2036
if samples is None :
2033
2037
samples = np .arange (self .num_samples )
@@ -2120,6 +2124,7 @@ class Ancestor:
2120
2124
time = attr .ib ()
2121
2125
focal_sites = attr .ib ()
2122
2126
haplotype = attr .ib ()
2127
+ sample_id = attr .ib ()
2123
2128
2124
2129
2125
2130
class AncestorData (DataContainer ):
@@ -2157,7 +2162,7 @@ class AncestorData(DataContainer):
2157
2162
"""
2158
2163
2159
2164
FORMAT_NAME = "tsinfer-ancestor-data"
2160
- FORMAT_VERSION = (3 , 0 )
2165
+ FORMAT_VERSION = (3 , 1 )
2161
2166
2162
2167
def __init__ (self , sample_data , ** kwargs ):
2163
2168
super ().__init__ (** kwargs )
@@ -2218,6 +2223,13 @@ def __init__(self, sample_data, **kwargs):
2218
2223
compressor = self ._compressor ,
2219
2224
fill_value = None ,
2220
2225
)
2226
+ self .data .create_dataset (
2227
+ "ancestors/sample_id" ,
2228
+ shape = (0 ,),
2229
+ chunks = chunks ,
2230
+ compressor = self ._compressor ,
2231
+ dtype = np .int32 ,
2232
+ )
2221
2233
2222
2234
self ._alloc_ancestor_writer ()
2223
2235
@@ -2233,6 +2245,7 @@ def _alloc_ancestor_writer(self):
2233
2245
"time" : self .ancestors_time ,
2234
2246
"focal_sites" : self .ancestors_focal_sites ,
2235
2247
"haplotype" : self .ancestors_haplotype ,
2248
+ "sample_id" : self .ancestors_sample_id ,
2236
2249
},
2237
2250
num_threads = self ._num_flush_threads ,
2238
2251
)
@@ -2254,6 +2267,7 @@ def __str__(self):
2254
2267
("ancestors/time" , zarr_summary (self .ancestors_time )),
2255
2268
("ancestors/focal_sites" , zarr_summary (self .ancestors_focal_sites )),
2256
2269
("ancestors/haplotype" , zarr_summary (self .ancestors_haplotype )),
2270
+ ("ancestors/sample_id" , zarr_summary (self .ancestors_sample_id )),
2257
2271
]
2258
2272
return super ().__str__ () + self ._format_str (values )
2259
2273
@@ -2278,6 +2292,9 @@ def data_equal(self, other):
2278
2292
self .ancestors_focal_sites [:], other .ancestors_focal_sites [:]
2279
2293
)
2280
2294
and np_obj_equal (self .ancestors_haplotype [:], other .ancestors_haplotype [:])
2295
+ and np .array_equal (
2296
+ self .ancestors_sample_id [:], other .ancestors_sample_id [:]
2297
+ )
2281
2298
)
2282
2299
2283
2300
@property
@@ -2320,6 +2337,10 @@ def ancestors_focal_sites(self):
2320
2337
def ancestors_haplotype (self ):
2321
2338
return self .data ["ancestors/haplotype" ]
2322
2339
2340
+ @property
2341
+ def ancestors_sample_id (self ):
2342
+ return self .data ["ancestors/sample_id" ]
2343
+
2323
2344
@property
2324
2345
def ancestors_length (self ):
2325
2346
"""
@@ -2338,6 +2359,7 @@ def insert_proxy_samples(
2338
2359
* ,
2339
2360
sample_ids = None ,
2340
2361
epsilon = None ,
2362
+ map_ancestors = False ,
2341
2363
allow_mutation = False ,
2342
2364
require_same_sample_data = True ,
2343
2365
** kwargs ,
@@ -2350,7 +2372,8 @@ def insert_proxy_samples(
2350
2372
2351
2373
A *proxy sample ancestor* is an ancestor based upon a known sample. At
2352
2374
sites used in the full inference process, the haplotype of this ancestor
2353
- is identical to that of the sample on which it is based. The time of the
2375
+ is identical to that of the sample on which it is based, and the
2376
+ ancestor records the ID of that sample as its ``sample_id``. The time of the
2354
2377
ancestor is taken to be a fraction ``epsilon`` older than the sample on
2355
2378
which it is based.
2356
2379
@@ -2364,11 +2387,11 @@ def insert_proxy_samples(
2364
2387
2365
2388
.. note::
2366
2389
2367
- The proxy sample ancestors inserted here will correspond to extra nodes
2368
- in the inferred tree sequence. At sites which are not used in the full
2390
+ The proxy sample ancestors inserted here will end up as extra nodes
2391
+ in the inferred tree sequence, but at sites which are not used in the full
2369
2392
inference process (e.g. sites unique to a single historical sample),
2370
- these proxy sample ancestor nodes may have a different genotype from
2371
- their corresponding sample.
2393
+ it is possible for these proxy sample ancestor nodes to have a different
2394
+ genotype from their corresponding sample.
2372
2395
2373
2396
:param SampleData sample_data: The :class:`.SampleData` instance
2374
2397
from which to select the samples used to create extra ancestors.
@@ -2403,7 +2426,8 @@ def insert_proxy_samples(
2403
2426
to ensure that the encoding of alleles in ``sample_data`` matches the
2404
2427
encoding in the current :class:`AncestorData` instance (i.e. that in the
2405
2428
original :class:`.SampleData` instance on which the current ancestors
2406
- are based).
2429
+ are based). Note that in this case, the sample_id is not recorded in the
2430
+ returned object.
2407
2431
:param \\ **kwargs: Further arguments passed to the constructor when creating
2408
2432
the new :class:`AncestorData` instance which will be returned.
2409
2433
@@ -2501,7 +2525,11 @@ def insert_proxy_samples(
2501
2525
time = proxy_time ,
2502
2526
focal_sites = [],
2503
2527
haplotype = haplotype ,
2528
+ sample_id = sample_id
2529
+ if sample_data .uuid == self .sample_data_uuid
2530
+ else tskit .NULL ,
2504
2531
)
2532
+
2505
2533
# Add any ancestors remaining in the current instance
2506
2534
while ancestor is not None :
2507
2535
other .add_ancestor (** attr .asdict (ancestor , filter = exclude_id ))
@@ -2583,7 +2611,6 @@ def truncate_ancestors(
2583
2611
start = self .ancestors_start [:]
2584
2612
end = self .ancestors_end [:]
2585
2613
time = self .ancestors_time [:]
2586
- focal_sites = self .ancestors_focal_sites [:]
2587
2614
haplotypes = self .ancestors_haplotype [:]
2588
2615
if upper_time_bound > np .max (time ) or lower_time_bound > np .max (time ):
2589
2616
raise ValueError ("Time bounds cannot be greater than older ancestor" )
@@ -2621,16 +2648,12 @@ def truncate_ancestors(
2621
2648
)
2622
2649
start [anc .id ] = insert_pos_start
2623
2650
end [anc .id ] = insert_pos_end
2624
- time [anc .id ] = anc .time
2625
- focal_sites [anc .id ] = anc .focal_sites
2626
2651
haplotypes [anc .id ] = anc .haplotype [
2627
2652
insert_pos_start - anc .start : insert_pos_end - anc .start
2628
2653
]
2629
2654
# TODO - record truncation in ancestors' metadata when supported
2630
2655
truncated .ancestors_start [:] = start
2631
2656
truncated .ancestors_end [:] = end
2632
- truncated .ancestors_time [:] = time
2633
- truncated .ancestors_focal_sites [:] = focal_sites
2634
2657
truncated .ancestors_haplotype [:] = haplotypes
2635
2658
truncated .record_provenance (command = "truncate_ancestors" )
2636
2659
truncated .finalise ()
@@ -2651,6 +2674,12 @@ def set_inference_sites(self, site_ids):
2651
2674
sites in the sample data file, and the IDs must be in increasing order.
2652
2675
2653
2676
This must be called before the first call to :meth:`.add_ancestor`.
2677
+
2678
+ .. note::
2679
+ To obtain a list of which sites in a sample data file or a tree sequence have
2680
+ been placed into the ancestors file for use in inference, you can apply
2681
+ :func:`numpy.isin` to the list of positions, e.g.
2682
+ ``np.isin(sample_data.sites_position[:], ancestors.sites_position[:])``
2654
2683
"""
2655
2684
self ._check_build_mode ()
2656
2685
position = self .sample_data .sites_position [:][site_ids ]
@@ -2659,12 +2688,18 @@ def set_inference_sites(self, site_ids):
2659
2688
array [:] = position
2660
2689
self ._num_alleles = self .sample_data .num_alleles (site_ids )
2661
2690
2662
- def add_ancestor (self , start , end , time , focal_sites , haplotype ):
2691
+ def add_ancestor (
2692
+ self , start , end , time , focal_sites , haplotype , sample_id = tskit .NULL
2693
+ ):
2663
2694
"""
2664
2695
Adds an ancestor with the specified haplotype, with ancestral material over the
2665
2696
interval [start:end], that is associated with the specified timepoint and has new
2666
- mutations at the specified list of focal sites. Ancestors should be added in time
2667
- order, with the oldest first. The id of the added ancestor is returned.
2697
+ mutations at the specified list of focal sites. If this ancestor is based on a
2698
+ specific sample from the associated sample_data file (i.e. a historical sample)
2699
+ then the ``sample_id`` in the sample data file can also be passed as a parameter.
2700
+
2701
+ Ancestors should be added in time order, with the oldest first. The id of
2702
+ the added ancestor is returned.
2668
2703
"""
2669
2704
self ._check_build_mode ()
2670
2705
haplotype = tskit .util .safe_np_int_cast (haplotype , dtype = np .int8 , copy = True )
@@ -2694,6 +2729,7 @@ def add_ancestor(self, start, end, time, focal_sites, haplotype):
2694
2729
time = time ,
2695
2730
focal_sites = focal_sites ,
2696
2731
haplotype = haplotype ,
2732
+ sample_id = sample_id ,
2697
2733
)
2698
2734
2699
2735
def finalise (self ):
@@ -2715,6 +2751,7 @@ def ancestors(self):
2715
2751
end = self .ancestors_end [:]
2716
2752
time = self .ancestors_time [:]
2717
2753
focal_sites = self .ancestors_focal_sites [:]
2754
+ sample_id = self .ancestors_sample_id [:]
2718
2755
for j , h in enumerate (chunk_iterator (self .ancestors_haplotype )):
2719
2756
yield Ancestor (
2720
2757
id = j ,
@@ -2723,6 +2760,7 @@ def ancestors(self):
2723
2760
time = time [j ],
2724
2761
focal_sites = focal_sites [j ],
2725
2762
haplotype = h ,
2763
+ sample_id = sample_id [j ],
2726
2764
)
2727
2765
2728
2766
0 commit comments