From 9548b9b7b03ec60dd7fe2cf96944d64ee484c282 Mon Sep 17 00:00:00 2001 From: sichao Date: Fri, 2 Jun 2023 14:45:08 -0400 Subject: [PATCH 01/31] create Dynamics class --- dynamo/tools/dynamics.py | 674 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 674 insertions(+) diff --git a/dynamo/tools/dynamics.py b/dynamo/tools/dynamics.py index c5b51c7cf..9fadb2f6d 100755 --- a/dynamo/tools/dynamics.py +++ b/dynamo/tools/dynamics.py @@ -52,6 +52,680 @@ warnings.simplefilter("ignore", SparseEfficiencyWarning) +class Dynamics: + def __init__( + self, + adata: AnnData, + filter_gene_mode: Literal["final", "basic", "no"] = "final", + use_smoothed: bool = True, + assumption_mRNA: Literal["ss", "kinetic", "auto"] = "auto", + assumption_protein: Literal["ss"] = "ss", + model: Literal["auto", "deterministic", "stochastic"] = "auto", + est_method: Literal["ols", "rlm", "ransac", "gmm", "negbin", "auto", "twostep", "direct"] = "auto", + NTR_vel: bool = False, + group: Optional[str] = None, + protein_names: Optional[List[str]] = None, + concat_data: bool = False, + log_unnormalized: bool = True, + one_shot_method: Literal["combined", "sci-fate", "sci_fate"] = "combined", + fraction_for_deg: bool = False, + re_smooth: bool = False, + sanity_check: bool = False, + del_2nd_moments: Optional[bool] = None, + cores: int = 1, + tkey: str = None, + **est_kwargs, + ): + if "pp" not in adata.uns_keys(): + raise ValueError(f"\nPlease run `dyn.pp.receipe_monocle(adata)` before running this function!") + self.adata = adata + self.filter_gene_mode = filter_gene_mode + self.use_smoothed = use_smoothed + self.assumption_mRNA = assumption_mRNA + self.assumption_protein = assumption_protein + self.model = model + self.est_method = est_method + self.NTR_vel = NTR_vel + self.group = group + self.protein_names = protein_names + self.concat_data = concat_data + self.log_unnormalized = log_unnormalized + self.one_shot_method = one_shot_method + self.fraction_for_deg = fraction_for_deg + self.re_smooth = re_smooth + self.sanity_check = sanity_check + self.del_2nd_moments = DynamoAdataConfig.use_default_var_if_none( + del_2nd_moments, DynamoAdataConfig.DYNAMICS_DEL_2ND_MOMENTS_KEY + ) + self.cores = cores + if tkey is not None: + if adata.obs[tkey].max() > 60: + main_warning( + "Looks like you are using minutes as the time unit. For the purpose of numeric stability, " + "we recommend using hour as the time unit." + ) + self.tkey = adata.uns["pp"]["tkey"] if tkey is None else tkey + self.est_kwargs = est_kwargs + + def check_model(self, model): + pass + + def _calc_vel_utils_ss(self, vel, U1, S1, U2, S2): + if self.has_splicing: + if self.experiment_type == "kin": + Kc = np.clip(self.gamma[:, None], 0, 1 - 1e-3) # S - U slope + gamma_ = -(np.log(1 - Kc) / self.t[None, :]) # actual gamma + + vel_U = U2.multiply(csr_matrix(gamma_ / Kc)) - csr_matrix(self.beta).multiply( + U1) # vel.vel_s(U_) + vel_S = vel.vel_s(U1, S1) + + vel_N = (U2 - csr_matrix(Kc).multiply(U2)).multiply( + csr_matrix(gamma_ / Kc)) # vel.vel_u(U) + # scale back to true velocity via multiplying "gamma_ / Kc". 
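# Illustration (a minimal sketch, not part of the patch): the "kin" branch above treats the
# fitted, clipped slope Kc as k = 1 - exp(-gamma_ * t), so the true degradation rate is
# recovered as gamma_ = -log(1 - Kc) / t, and the slope-scale velocities are rescaled by
# gamma_ / Kc. A quick numeric check of that relation, using hypothetical values:
import numpy as np

gamma_true, t_label = 0.5, 2.0                  # hypothetical degradation rate and labeling time
Kc_obs = 1 - np.exp(-gamma_true * t_label)      # slope one would fit for this gene
gamma_back = -np.log(1 - Kc_obs) / t_label      # same formula as gamma_ above; recovers 0.5
assert np.isclose(gamma_back, gamma_true)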
+ vel_T = (U2 - csr_matrix(Kc).multiply(S2)).multiply(csr_matrix(gamma_ / Kc)) + elif self.experiment_type == "mix_std_stm": + # steady state RNA: u0, stimulation RNA: u_new; + # cell-wise transcription rate under simulation: alpha1 + u0, u_new, alpha1 = solve_alpha_2p_mat( + t0=np.max(self.t) - self.t, + t1=self.t, + alpha0=self.alpha[0], + beta=self.beta, + u1=U2, + ) + vel_U = alpha1 - csr_matrix(self.beta[:, None]).multiply(U1) + vel_S = vel.vel_s(U1, S1) + + vel_N = alpha1 - csr_matrix(self.gamma[:, None]).multiply(u_new) + vel_T = alpha1 - csr_matrix(self.beta[:, None]).multiply(S2) + else: + vel_U = vel.vel_u(U1) + vel_S = vel.vel_s(U1, S1) + vel_N = vel.vel_u(U2) + vel_T = vel.vel_s(U2, S2 - U2) # need to consider splicing + else: + if self.experiment_type == "kin": + vel_U = np.nan + vel_S = np.nan + + Kc = np.clip(self.gamma[:, None], 0, 1 - 1e-3) # S - U slope + gamma_ = -(np.log(1 - Kc) / self.t[None, :]) # actual gamma + vel_N = (U2 - csr_matrix(Kc).multiply(U2)).multiply( + csr_matrix(gamma_ / Kc)) # vel.vel_u(U) + # scale back to true velocity via multiplying "gamma_ / Kc". + vel_T = (U2 - csr_matrix(Kc).multiply(S2)).multiply(csr_matrix(gamma_ / Kc)) + elif self.experiment_type == "mix_std_stm": + vel_U = np.nan + vel_S = np.nan + + # steady state RNA: u0, stimulation RNA: u_new; + # cell-wise transcription rate under simulation: alpha1 + u0, u_new, alpha1 = solve_alpha_2p_mat( + t0=np.max(self.t) - self.t, + t1=self.t, + alpha0=self.alpha[0], + beta=self.gamma, + u1=self.U, + ) + + vel_N = alpha1 - csr_matrix(self.gamma[:, None]).multiply(u_new) + vel_T = alpha1 - csr_matrix(self.gamma[:, None]).multiply(S2) + else: + vel_U = np.nan + vel_S = np.nan + vel_N = vel.vel_u(U2) + vel_T = vel.vel_u(S2) # don't consider splicing + return vel_U, vel_S, vel_N, vel_T + + def _calc_vel_utils_kin(self, vel, U1, S1, U2, S2): + if self.has_splicing: + if self.experiment_type == "kin": + vel_U = vel.vel_u(U1) + vel_S = vel.vel_s(U1, S1) + vel.parameters["beta"] = self.gamma + vel_N = vel.vel_u(U2) + vel_T = vel.vel_u(S2) # no need to consider splicing + elif self.experiment_type == "deg": + if self.splicing_labeling: + vel_U = np.nan + vel_S = vel.vel_s(U1, S1) + vel_N = np.nan + vel_T = np.nan + else: + vel_U = np.nan + vel_S = vel.vel_s(U1, S1) + vel_N = np.nan + vel_T = np.nan + elif self.experiment_type in ["mix_kin_deg", "mix_pulse_chase"]: + vel_U = vel.vel_u(U1, repeat=True) + vel_S = vel.vel_s(U1, S1) + vel.parameters["beta"] = self.gamma + vel_N = vel.vel_u(U2, repeat=True) + vel_T = vel.vel_u(S2, repeat=True) # no need to consider splicing + else: + if self.experiment_type == "kin": + vel_U = np.nan + vel_S = np.nan + + # calculate cell-wise alpha, if est_method is twostep, this can be skipped + alpha_ = one_shot_alpha_matrix(U2, self.gamma, self.t) + + vel.parameters["alpha"] = alpha_ + + vel_N = vel.vel_u(U2) + vel_T = vel.vel_u(S2) # don't consider splicing + elif self.experiment_type == "deg": + vel_U = np.nan + vel_S = np.nan + vel_N = np.nan + vel_T = np.nan + elif self.experiment_type in ["mix_kin_deg", "mix_pulse_chase"]: + vel_U = np.nan + vel_S = np.nan + vel_N = vel.vel_u(U2, repeat=True) + # TODO: figure out whether we need repeat here + vel_T = vel.vel_u(S2, repeat=True) # don't consider splicing + return vel_U, vel_S, vel_N, vel_T + + def _calc_vel_utils(self, vel, vel_func, U, S, U_, S_): + if self.NTR_vel: + vel_U, vel_S, vel_N, vel_T = vel_func(vel=vel, U1=U_, S1=S_, U2=U, S2=S) + else: + vel_U, vel_S, vel_N, vel_T = vel_func(vel=vel, U1=U, S1=S, U2=U_, S2=S_) 
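# Illustration (a minimal sketch, not part of the patch): in the "mix_std_stm" branches above,
# solve_alpha_2p_mat splits the labeled RNA into a pre-stimulation part (u0) and a newly
# induced part (u_new) and returns the cell-wise transcription rate alpha1 during the
# stimulation window; velocities are then alpha1 minus degradation of the matching layer.
# The basic relation such a solver can rely on (an assumption about its internals, stated
# here only for intuition): with a constant rate alpha1 over a window t1, degradation rate
# beta, and no initial new RNA, u_new(t1) = alpha1 / beta * (1 - exp(-beta * t1)), so alpha1
# can be solved from the observed u_new:
import numpy as np

alpha1_true, beta, t1 = 3.0, 0.8, 1.5                    # hypothetical rates and stimulation time
u_new = alpha1_true / beta * (1 - np.exp(-beta * t1))    # induced RNA accumulated over t1
alpha1_solved = beta * u_new / (1 - np.exp(-beta * t1))  # invert the relation; recovers 3.0
assert np.isclose(alpha1_solved, alpha1_true)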
+ return vel_U, vel_S, vel_N, vel_T + + def calculate_velocity_ss(self): + U, S = get_U_S_for_velocity_estimation( + self.subset_adata, + self.use_smoothed, + self.has_splicing, + self.has_labeling, + self.log_unnormalized, + self.NTR_vel, + ) + vel = Velocity(estimation=self.est) + + if self.experiment_type.lower() in [ + "one_shot", + "one-shot", + "kin", + "mix_std_stm", + ]: + U_, S_ = get_U_S_for_velocity_estimation( + self.subset_adata, + self.use_smoothed, + self.has_splicing, + self.has_labeling, + self.log_unnormalized, + not self.NTR_vel, + ) + vel_U, vel_S, vel_N, vel_T = self._calc_vel_utils(vel=vel, vel_func=self._calc_vel_utils_ss, U=U, S=S, U_=U_, S_=S_) + else: + vel_U = vel.vel_u(U) + vel_S = vel.vel_s(U, S) + vel_N, vel_T = np.nan, np.nan + + vel_P = vel.vel_p(S, self.P) + + return vel_U, vel_S, vel_N, vel_T, vel_P + + def calculate_velocity_kin(self): + # if alpha = None, set alpha to be U; N - gamma R + params = {"alpha": self.alpha, "beta": self.beta, "gamma": self.gamma, "t": self.t} + vel = Velocity(**params) + # Fix below: + U, S = get_U_S_for_velocity_estimation( + self.subset_adata, + self.use_smoothed, + self.has_splicing, + self.has_labeling, + self.log_unnormalized, + self.NTR_vel, + ) + + U_, S_ = get_U_S_for_velocity_estimation( + self.subset_adata, + self.use_smoothed, + self.has_splicing, + self.has_labeling, + self.log_unnormalized, + not self.NTR_vel, + ) + vel_U, vel_S, vel_N, vel_T = self._calc_vel_utils(vel=vel, vel_func=self._calc_vel_utils_kin, U=U, S=S, U_=U_, S_=S_) + + vel_P = vel.vel_p(S, self.P) + + return vel_U, vel_S, vel_N, vel_T, vel_P + + def set_velocity_ss(self, vel_U, vel_S, vel_N, vel_T, vel_P, cur_grp, cur_cells_bools, valid_bools_, kin_param_pre): + self.adata = set_velocity( + self.adata, + vel_U, + vel_S, + vel_N, + vel_T, + vel_P, + self._group, + cur_grp, + cur_cells_bools, + valid_bools_, + self.ind_for_proteins, + ) + + self.adata = set_param_ss( + self.adata, + self.est, + self.alpha, + self.beta, + self.gamma, + self.eta, + self.delta, + self.experiment_type, + self._group, + cur_grp, + kin_param_pre, + valid_bools_, + self.ind_for_proteins, + ) + + def set_velocity_kin(self, vel_U, vel_S, vel_N, vel_T, vel_P, cur_grp, cur_cells_bools, valid_bools_, kin_param_pre, extra_params): + self.adata = set_velocity( + self.adata, + vel_U, + vel_S, + vel_N, + vel_T, + vel_P, + self._group, + cur_grp, + cur_cells_bools, + valid_bools_, + self.ind_for_proteins, + ) + + self.adata = set_param_kinetic( + self.adata, + self.alpha, + self.a, + self.b, + self.alpha_a, + self.alpha_i, + self.beta, + self.gamma, + self.cost, + self.logLL, + kin_param_pre, + extra_params, + self._group, + cur_grp, + cur_cells_bools, + valid_bools_, + ) + + def estimate_vel_calc_params_ss(self): + if self.est_method.lower() == "auto": + self.est_method = "gmm" if self.model.lower() == "stochastic" else "ols" + + if self.experiment_type.lower() == "one-shot": + self.beta = self.subset_adata.var.beta if "beta" in self.subset_adata.var.keys() else None + self.gamma = self.subset_adata.var.gamma if "gamma" in self.subset_adata.var.keys() else None + ss_estimation_kwargs = {"beta": self.beta, "gamma": self.gamma} + else: + ss_estimation_kwargs = {} + + self.est = ss_estimation( + U=self.U.copy() if self.U is not None else None, + Ul=self.Ul.copy() if self.Ul is not None else None, + S=self.S.copy() if self.S is not None else None, + Sl=self.Sl.copy() if self.Sl is not None else None, + P=self.P.copy() if self.P is not None else None, + US=self.US.copy() if self.US is 
not None else None, + S2=self.S2.copy() if self.S2 is not None else None, + conn=self.subset_adata.obsp["moments_con"], + t=self.t, + ind_for_proteins=self.ind_for_proteins, + model=self.model, + est_method=self.est_method, + experiment_type=self.experiment_type, + assumption_mRNA=self.assumption_mRNA, + assumption_protein=self.assumption_protein, + concat_data=self.concat_data, + cores=self.cores, + **ss_estimation_kwargs, + ) # U: (unlabeled) unspliced; S: (unlabeled) spliced; U / Ul: old and labeled; U, Ul, S, Sl: uu/ul/su/sl + + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + + if self.experiment_type.lower() in ["one-shot", "one_shot"]: + self.est.fit(one_shot_method=self.one_shot_method, **self.est_kwargs) + else: + # experiment_type can be `kin` also and by default use + # conventional method to estimate k but correct for time + self.est.fit(**self.est_kwargs) + + self.alpha, self.beta, self.gamma, self.eta, self.delta = self.est.parameters.values() + + def estimate_vel_calc_params_kin(self, cur_grp_i, cur_grp): + return_ntr = True if self.fraction_for_deg and self.experiment_type.lower() == "deg" else False + + if self.model_was_auto and self.experiment_type.lower() == "kin": + self.model = "mixture" + if self.est_method == "auto": + self.est_method = "direct" + data_type = "smoothed" if self.use_smoothed else "sfs" + + (params, half_life, self.cost, self.logLL, param_ranges, cur_X_data, cur_X_fit_data,) = kinetic_model( + self.subset_adata, + self.tkey, + self.model, + self.est_method, + self.experiment_type, + self.has_splicing, + self.splicing_labeling, + has_switch=True, + param_rngs={}, + data_type=data_type, + return_ntr=return_ntr, + **self.est_kwargs, + ) + + if type(params) == dict: + self.alpha = params.pop("alpha") + params = pd.DataFrame(params) + else: + self.alpha = params.loc[:, "alpha"].values if "alpha" in params.columns else None + + len_t, len_g = len(np.unique(self.t)), len(self._group) + if cur_grp == self._group[0]: + if len_g != 1: + # X_data, X_fit_data = np.zeros((len_g, adata.n_vars, len_t)), np.zeros((len_g, adata.n_vars,len_t)) + X_data, X_fit_data = [None] * len_g, [None] * len_g + + if len(self._group) == 1: + X_data, X_fit_data = cur_X_data, cur_X_fit_data + else: + # X_data[cur_grp_i, :, :], X_fit_data[cur_grp_i, :, :] = cur_X_data, cur_X_fit_data + X_data[cur_grp_i], X_fit_data[cur_grp_i] = ( + cur_X_data, + cur_X_fit_data, + ) + + self.a, self.b, self.alpha_a, self.alpha_i, self.beta, self.gamma = ( + params.loc[:, "a"].values if "a" in params.columns else None, + params.loc[:, "b"].values if "b" in params.columns else None, + params.loc[:, "alpha_a"].values if "alpha_a" in params.columns else None, + params.loc[:, "alpha_i"].values if "alpha_i" in params.columns else None, + params.loc[:, "beta"].values if "beta" in params.columns else None, + params.loc[:, "gamma"].values if "gamma" in params.columns else None, + ) + if self.alpha is None: + self.alpha = fbar(self.a, self.b, self.alpha_a, 0) if self.alpha_i is None else fbar(self.a, self.b, self.alpha_a, self.alpha_i) + all_kinetic_params = [ + "a", + "b", + "alpha_a", + "alpha_i", + "alpha", + "beta", + "gamma", + ] + + extra_params = params.loc[:, params.columns.difference(all_kinetic_params)] + return extra_params + + def dynamics_ss(self, cur_grp_i, cur_grp, cur_cells_bools, valid_bools_, kin_param_pre): + self.estimate_vel_calc_params_ss() + vel_U, vel_S, vel_N, vel_T, vel_P = self.calculate_velocity_ss() + self.set_velocity_ss(vel_U, vel_S, vel_N, vel_T, vel_P, cur_grp, 
cur_cells_bools, valid_bools_, kin_param_pre) + + def dynamics_kin(self, cur_grp_i, cur_grp, cur_cells_bools, valid_bools_, kin_param_pre): + extra_params = self.estimate_vel_calc_params_kin(cur_grp_i=cur_grp_i, cur_grp=cur_grp) + vel_U, vel_S, vel_N, vel_T, vel_P = self.calculate_velocity_kin() + self.set_velocity_kin(vel_U, vel_S, vel_N, vel_T, vel_P, cur_grp, cur_cells_bools, valid_bools_, kin_param_pre, extra_params) + + def estimate(self): + (self.experiment_type, self.has_splicing, self.has_labeling, self.splicing_labeling, self.has_protein,) = ( + self.adata.uns["pp"]["experiment_type"], + self.adata.uns["pp"]["has_splicing"], + self.adata.uns["pp"]["has_labeling"], + self.adata.uns["pp"]["splicing_labeling"], + self.adata.uns["pp"]["has_protein"], + ) + + X_data, X_fit_data = None, None + filter_list, filter_gene_mode_list = ( + [ + "use_for_pca", + "pass_basic_filter", + "no", + ], + ["final", "basic", "no"], + ) + filter_checker = [i in self.adata.var.columns for i in filter_list[:2]] + filter_checker.append(True) + filter_id = filter_gene_mode_list.index(self.filter_gene_mode) + which_filter = np.where(filter_checker[filter_id:])[0][0] + filter_id + + filter_gene_mode = filter_gene_mode_list[which_filter] + + valid_bools = get_valid_bools(self.adata, filter_gene_mode) + gene_num = sum(valid_bools) + if gene_num == 0: + raise Exception(f"no genes pass filter. Try resetting `filter_gene_mode = 'no'` to use all genes.") + + if self.model.lower() == "auto": + self.model = "stochastic" + self.model_was_auto = True + else: + self.model_was_auto = False + + if self.tkey is not None: + if self.adata.obs[self.tkey].max() > 60: + main_warning( + "Looks like you are using minutes as the time unit. For the purpose of numeric stability, " + "we recommend using hour as the time unit." + ) + + if self.model.lower() == "stochastic" or self.use_smoothed or self.re_smooth: + M_layers = [i for i in self.adata.layers.keys() if i.startswith("M_")] + + if len(M_layers) < 2 or self.re_smooth: + main_info("removing existing M layers:%s..." % (str(list(M_layers))), indent_level=2) + for i in M_layers: + del self.adata.layers[i] + main_info("making adata smooth...", indent_level=2) + + if self.group is not None and self.group in self.adata.obs.columns: + moments(self.adata, genes=valid_bools, group=self.group) + else: + moments(self.adata, genes=valid_bools, group=self.tkey) + elif self.tkey is not None: + main_warning( + f"You used tkey {self.tkey} (or group {self.group}), but you have calculated local smoothing (1st moment) " + f"for your data before. Please ensure you used the desired tkey or group when the smoothing was " + f"performed. Try setting re_smooth = True if not sure." + ) + + valid_adata = self.adata[:, valid_bools].copy() + if self.group is not None and self.group in self.adata.obs.columns: + self._group = self.adata.obs[self.group].unique() + if any(self.adata.obs[self.group].value_counts() < 50): + main_warning( + f"Note that some groups have less than 50 cells, this may lead to the velocities for some " + f"cells are all NaN values and cause issues for all downstream analysis. Please try to " + f"coarse-grain cell groupings. 
Cell number for each group are {self.adata.obs[self.group].value_counts()}" + ) + + else: + self._group = ["_all_cells"] + + for cur_grp_i, cur_grp in enumerate(self._group): + if cur_grp == "_all_cells": + kin_param_pre = "" + cur_cells_bools = np.ones(valid_adata.shape[0], dtype=bool) + self.subset_adata = valid_adata[cur_cells_bools] + else: + kin_param_pre = str(self.group) + "_" + str(cur_grp) + "_" + cur_cells_bools = (valid_adata.obs[self.group] == cur_grp).values + self.subset_adata = valid_adata[cur_cells_bools] + + if self.model.lower() == "stochastic" or self.use_smoothed: + moments(self.subset_adata) + ( + self.U, + self.Ul, + self.S, + self.Sl, + self.P, + self.US, + self.U2, + self.S2, + self.t, + self.normalized, + self.ind_for_proteins, + assump_mRNA, + ) = get_data_for_kin_params_estimation( + self.subset_adata, + self.has_splicing, + self.has_labeling, + self.model, + self.use_smoothed, + self.tkey, + self.protein_names, + self.log_unnormalized, + self.NTR_vel, + ) + + valid_bools_ = valid_bools.copy() + if self.sanity_check and self.experiment_type.lower() in ["kin", "deg"]: + indices_valid_bools = np.where(valid_bools)[0] + t, L = ( + t.flatten(), + (0 if self.Ul is None else self.Ul) + (0 if self.Sl is None else self.Sl), + ) + t_uniq = np.unique(t) + + valid_gene_checker = np.zeros(gene_num, dtype=bool) + for L_iter, cur_L in tqdm( + enumerate(L), + desc=f"sanity check of {self.experiment_type} experiment data:", + ): + cur_L = cur_L.A.flatten() if issparse(cur_L) else cur_L.flatten() + y = strat_mom(cur_L, t, np.nanmean) + slope, _ = fit_linreg(t_uniq, y, intercept=True, r2=False) + valid_gene_checker[L_iter] = ( + True + if (slope > 0 and self.experiment_type == "kin") or (slope < 0 and self.experiment_type == "deg") + else False + ) + valid_bools_[indices_valid_bools[~valid_gene_checker]] = False + main_warning(f"filtering {gene_num - valid_gene_checker.sum()} genes after sanity check.") + + if len(valid_bools_) < 5: + raise Exception( + f"After sanity check, you have less than 5 valid genes. Something is wrong about your " + f"metabolic labeling experiment!" 
+ ) + + self.U, self.Ul, self.S, self.Sl = ( + (None if self.U is None else self.U[valid_gene_checker, :]), + (None if self.Ul is None else self.Ul[valid_gene_checker, :]), + (None if self.S is None else self.S[valid_gene_checker, :]), + (None if self.Sl is None else self.Sl[valid_gene_checker, :]), + ) + self.subset_adata = self.subset_adata[:, valid_gene_checker] + self.adata.var[kin_param_pre + "sanity_check"] = valid_bools_ + + if self.assumption_mRNA.lower() == "auto": + self.assumption_mRNA = assump_mRNA + if self.experiment_type.lower() == "conventional": + self.assumption_mRNA = "ss" + elif self.experiment_type.lower() in ["mix_pulse_chase", "deg", "kin"]: + self.assumption_mRNA = "kinetic" + + if self.model.lower() == "stochastic" and self.experiment_type.lower() not in [ + "conventional", + "kinetics", + "degradation", + "kin", + "deg", + "one-shot", + ]: + """ + # temporially convert to deterministic model as moment model for mix_std_stm + and other types of labeling experiment is ongoing.""" + + self.model = "deterministic" + + if self.model_was_auto and self.experiment_type.lower() in [ + "kinetic", + "kin", + "degradation", + "deg", + ]: + self.model = "deterministic" + + if self.assumption_mRNA.lower() == "ss" or (self.experiment_type.lower() in ["one-shot", "mix_std_stm"]): + self.dynamics_ss( + cur_grp_i=cur_grp_i, + cur_grp=cur_grp, + cur_cells_bools=cur_cells_bools, + valid_bools_=valid_bools_, + kin_param_pre=kin_param_pre, + ) + elif self.assumption_mRNA.lower() == "kinetic": + self.dynamics_kin( + cur_grp_i=cur_grp_i, + cur_grp=cur_grp, + cur_cells_bools=cur_cells_bools, + valid_bools_=valid_bools_, + kin_param_pre=kin_param_pre, + ) + # add protein related parameters in the moment model below: + elif self.model.lower() == "model_selection": + main_warning("Not implemented yet.") + + if self.group is not None and self.group in self.adata.obs[self.group]: + uns_key = self.group + "_dynamics" + else: + uns_key = "dynamics" + + if self.sanity_check and self.experiment_type in ["kin", "deg"]: + sanity_check_cols = self.adata.var.columns.str.endswith("sanity_check") + self.adata.var["use_for_dynamics"] = self.adata.var.loc[:, sanity_check_cols].sum(1).astype(bool) + else: + self.adata.var["use_for_dynamics"] = False + self.adata.var.loc[valid_bools, "use_for_dynamics"] = True + + self.adata.uns[uns_key] = { + "filter_gene_mode": filter_gene_mode, + "t": self.t, + "group": self.group, + "X_data": X_data, + "X_fit_data": X_fit_data, + "asspt_mRNA": self.assumption_mRNA, + "experiment_type": self.experiment_type, + "normalized": self.normalized, + "model": self.model, + "est_method": self.est_method, + "has_splicing": self.has_splicing, + "has_labeling": self.has_labeling, + "splicing_labeling": self.splicing_labeling, + "has_protein": self.has_protein, + "use_smoothed": self.use_smoothed, + "NTR_vel": self.NTR_vel, + "log_unnormalized": self.log_unnormalized, + "fraction_for_deg": self.fraction_for_deg, + } + + if self.del_2nd_moments: + remove_2nd_moments(self.adata) + + return self.adata + + # incorporate the model selection code soon def dynamics( adata: AnnData, From bfc590f9396d7dd4a6fb496db831284162c623cf Mon Sep 17 00:00:00 2001 From: sichao Date: Fri, 2 Jun 2023 16:31:28 -0400 Subject: [PATCH 02/31] func to methods --- dynamo/tools/dynamics.py | 143 ++++++++++++++++++++------------------- 1 file changed, 72 insertions(+), 71 deletions(-) diff --git a/dynamo/tools/dynamics.py b/dynamo/tools/dynamics.py index 9fadb2f6d..f950e411d 100755 --- a/dynamo/tools/dynamics.py 
+++ b/dynamo/tools/dynamics.py @@ -83,7 +83,12 @@ def __init__( self.use_smoothed = use_smoothed self.assumption_mRNA = assumption_mRNA self.assumption_protein = assumption_protein - self.model = model + if model.lower() == "auto": + self.model = "stochastic" + self.model_was_auto = True + else: + self.model = model + self.model_was_auto = False self.est_method = est_method self.NTR_vel = NTR_vel self.group = group @@ -235,9 +240,9 @@ def _calc_vel_utils(self, vel, vel_func, U, S, U_, S_): vel_U, vel_S, vel_N, vel_T = vel_func(vel=vel, U1=U, S1=S, U2=U_, S2=S_) return vel_U, vel_S, vel_N, vel_T - def calculate_velocity_ss(self): + def calculate_velocity_ss(self, subset_adata): U, S = get_U_S_for_velocity_estimation( - self.subset_adata, + subset_adata, self.use_smoothed, self.has_splicing, self.has_labeling, @@ -253,7 +258,7 @@ def calculate_velocity_ss(self): "mix_std_stm", ]: U_, S_ = get_U_S_for_velocity_estimation( - self.subset_adata, + subset_adata, self.use_smoothed, self.has_splicing, self.has_labeling, @@ -270,13 +275,13 @@ def calculate_velocity_ss(self): return vel_U, vel_S, vel_N, vel_T, vel_P - def calculate_velocity_kin(self): + def calculate_velocity_kin(self, subset_adata): # if alpha = None, set alpha to be U; N - gamma R params = {"alpha": self.alpha, "beta": self.beta, "gamma": self.gamma, "t": self.t} vel = Velocity(**params) # Fix below: U, S = get_U_S_for_velocity_estimation( - self.subset_adata, + subset_adata, self.use_smoothed, self.has_splicing, self.has_labeling, @@ -285,7 +290,7 @@ def calculate_velocity_kin(self): ) U_, S_ = get_U_S_for_velocity_estimation( - self.subset_adata, + subset_adata, self.use_smoothed, self.has_splicing, self.has_labeling, @@ -363,13 +368,13 @@ def set_velocity_kin(self, vel_U, vel_S, vel_N, vel_T, vel_P, cur_grp, cur_cells valid_bools_, ) - def estimate_vel_calc_params_ss(self): + def estimate_vel_calc_params_ss(self, subset_adata): if self.est_method.lower() == "auto": self.est_method = "gmm" if self.model.lower() == "stochastic" else "ols" if self.experiment_type.lower() == "one-shot": - self.beta = self.subset_adata.var.beta if "beta" in self.subset_adata.var.keys() else None - self.gamma = self.subset_adata.var.gamma if "gamma" in self.subset_adata.var.keys() else None + self.beta = subset_adata.var.beta if "beta" in subset_adata.var.keys() else None + self.gamma = subset_adata.var.gamma if "gamma" in subset_adata.var.keys() else None ss_estimation_kwargs = {"beta": self.beta, "gamma": self.gamma} else: ss_estimation_kwargs = {} @@ -382,7 +387,7 @@ def estimate_vel_calc_params_ss(self): P=self.P.copy() if self.P is not None else None, US=self.US.copy() if self.US is not None else None, S2=self.S2.copy() if self.S2 is not None else None, - conn=self.subset_adata.obsp["moments_con"], + conn=subset_adata.obsp["moments_con"], t=self.t, ind_for_proteins=self.ind_for_proteins, model=self.model, @@ -407,7 +412,7 @@ def estimate_vel_calc_params_ss(self): self.alpha, self.beta, self.gamma, self.eta, self.delta = self.est.parameters.values() - def estimate_vel_calc_params_kin(self, cur_grp_i, cur_grp): + def estimate_vel_calc_params_kin(self, cur_grp_i, cur_grp, subset_adata): return_ntr = True if self.fraction_for_deg and self.experiment_type.lower() == "deg" else False if self.model_was_auto and self.experiment_type.lower() == "kin": @@ -417,7 +422,7 @@ def estimate_vel_calc_params_kin(self, cur_grp_i, cur_grp): data_type = "smoothed" if self.use_smoothed else "sfs" (params, half_life, self.cost, self.logLL, param_ranges, cur_X_data, 
cur_X_fit_data,) = kinetic_model( - self.subset_adata, + subset_adata, self.tkey, self.model, self.est_method, @@ -441,13 +446,13 @@ def estimate_vel_calc_params_kin(self, cur_grp_i, cur_grp): if cur_grp == self._group[0]: if len_g != 1: # X_data, X_fit_data = np.zeros((len_g, adata.n_vars, len_t)), np.zeros((len_g, adata.n_vars,len_t)) - X_data, X_fit_data = [None] * len_g, [None] * len_g + self.X_data, self.X_fit_data = [None] * len_g, [None] * len_g if len(self._group) == 1: - X_data, X_fit_data = cur_X_data, cur_X_fit_data + self.X_data, self.X_fit_data = cur_X_data, cur_X_fit_data else: # X_data[cur_grp_i, :, :], X_fit_data[cur_grp_i, :, :] = cur_X_data, cur_X_fit_data - X_data[cur_grp_i], X_fit_data[cur_grp_i] = ( + self.X_data[cur_grp_i], self.X_fit_data[cur_grp_i] = ( cur_X_data, cur_X_fit_data, ) @@ -475,26 +480,17 @@ def estimate_vel_calc_params_kin(self, cur_grp_i, cur_grp): extra_params = params.loc[:, params.columns.difference(all_kinetic_params)] return extra_params - def dynamics_ss(self, cur_grp_i, cur_grp, cur_cells_bools, valid_bools_, kin_param_pre): - self.estimate_vel_calc_params_ss() - vel_U, vel_S, vel_N, vel_T, vel_P = self.calculate_velocity_ss() + def dynamics_ss(self, cur_grp_i, cur_grp, subset_adata, cur_cells_bools, valid_bools_, kin_param_pre): + self.estimate_vel_calc_params_ss(subset_adata=subset_adata) + vel_U, vel_S, vel_N, vel_T, vel_P = self.calculate_velocity_ss(subset_adata=subset_adata) self.set_velocity_ss(vel_U, vel_S, vel_N, vel_T, vel_P, cur_grp, cur_cells_bools, valid_bools_, kin_param_pre) - def dynamics_kin(self, cur_grp_i, cur_grp, cur_cells_bools, valid_bools_, kin_param_pre): - extra_params = self.estimate_vel_calc_params_kin(cur_grp_i=cur_grp_i, cur_grp=cur_grp) - vel_U, vel_S, vel_N, vel_T, vel_P = self.calculate_velocity_kin() + def dynamics_kin(self, cur_grp_i, cur_grp, subset_adata, cur_cells_bools, valid_bools_, kin_param_pre): + extra_params = self.estimate_vel_calc_params_kin(cur_grp_i=cur_grp_i, cur_grp=cur_grp, subset_adata=subset_adata) + vel_U, vel_S, vel_N, vel_T, vel_P = self.calculate_velocity_kin(subset_adata=subset_adata) self.set_velocity_kin(vel_U, vel_S, vel_N, vel_T, vel_P, cur_grp, cur_cells_bools, valid_bools_, kin_param_pre, extra_params) - def estimate(self): - (self.experiment_type, self.has_splicing, self.has_labeling, self.splicing_labeling, self.has_protein,) = ( - self.adata.uns["pp"]["experiment_type"], - self.adata.uns["pp"]["has_splicing"], - self.adata.uns["pp"]["has_labeling"], - self.adata.uns["pp"]["splicing_labeling"], - self.adata.uns["pp"]["has_protein"], - ) - - X_data, X_fit_data = None, None + def filter(self): filter_list, filter_gene_mode_list = ( [ "use_for_pca", @@ -514,39 +510,42 @@ def estimate(self): gene_num = sum(valid_bools) if gene_num == 0: raise Exception(f"no genes pass filter. Try resetting `filter_gene_mode = 'no'` to use all genes.") + return filter_gene_mode, valid_bools, gene_num - if self.model.lower() == "auto": - self.model = "stochastic" - self.model_was_auto = True - else: - self.model_was_auto = False + def smooth(self, valid_bools): + M_layers = [i for i in self.adata.layers.keys() if i.startswith("M_")] - if self.tkey is not None: - if self.adata.obs[self.tkey].max() > 60: - main_warning( - "Looks like you are using minutes as the time unit. For the purpose of numeric stability, " - "we recommend using hour as the time unit." - ) + if len(M_layers) < 2 or self.re_smooth: + main_info("removing existing M layers:%s..." 
% (str(list(M_layers))), indent_level=2) + for i in M_layers: + del self.adata.layers[i] + main_info("making adata smooth...", indent_level=2) - if self.model.lower() == "stochastic" or self.use_smoothed or self.re_smooth: - M_layers = [i for i in self.adata.layers.keys() if i.startswith("M_")] + if self.group is not None and self.group in self.adata.obs.columns: + moments(self.adata, genes=valid_bools, group=self.group) + else: + moments(self.adata, genes=valid_bools, group=self.tkey) + elif self.tkey is not None: + main_warning( + f"You used tkey {self.tkey} (or group {self.group}), but you have calculated local smoothing (1st moment) " + f"for your data before. Please ensure you used the desired tkey or group when the smoothing was " + f"performed. Try setting re_smooth = True if not sure." + ) + + def estimate(self): + (self.experiment_type, self.has_splicing, self.has_labeling, self.splicing_labeling, self.has_protein,) = ( + self.adata.uns["pp"]["experiment_type"], + self.adata.uns["pp"]["has_splicing"], + self.adata.uns["pp"]["has_labeling"], + self.adata.uns["pp"]["splicing_labeling"], + self.adata.uns["pp"]["has_protein"], + ) - if len(M_layers) < 2 or self.re_smooth: - main_info("removing existing M layers:%s..." % (str(list(M_layers))), indent_level=2) - for i in M_layers: - del self.adata.layers[i] - main_info("making adata smooth...", indent_level=2) + self.X_data, self.X_fit_data = None, None + filter_gene_mode, valid_bools, gene_num = self.filter() - if self.group is not None and self.group in self.adata.obs.columns: - moments(self.adata, genes=valid_bools, group=self.group) - else: - moments(self.adata, genes=valid_bools, group=self.tkey) - elif self.tkey is not None: - main_warning( - f"You used tkey {self.tkey} (or group {self.group}), but you have calculated local smoothing (1st moment) " - f"for your data before. Please ensure you used the desired tkey or group when the smoothing was " - f"performed. Try setting re_smooth = True if not sure." 
- ) + if self.model.lower() == "stochastic" or self.use_smoothed or self.re_smooth: + self.smooth(valid_bools=valid_bools) valid_adata = self.adata[:, valid_bools].copy() if self.group is not None and self.group in self.adata.obs.columns: @@ -565,14 +564,14 @@ def estimate(self): if cur_grp == "_all_cells": kin_param_pre = "" cur_cells_bools = np.ones(valid_adata.shape[0], dtype=bool) - self.subset_adata = valid_adata[cur_cells_bools] + subset_adata = valid_adata[cur_cells_bools] else: kin_param_pre = str(self.group) + "_" + str(cur_grp) + "_" cur_cells_bools = (valid_adata.obs[self.group] == cur_grp).values - self.subset_adata = valid_adata[cur_cells_bools] + subset_adata = valid_adata[cur_cells_bools] if self.model.lower() == "stochastic" or self.use_smoothed: - moments(self.subset_adata) + moments(subset_adata) ( self.U, self.Ul, @@ -587,7 +586,7 @@ def estimate(self): self.ind_for_proteins, assump_mRNA, ) = get_data_for_kin_params_estimation( - self.subset_adata, + subset_adata, self.has_splicing, self.has_labeling, self.model, @@ -601,11 +600,11 @@ def estimate(self): valid_bools_ = valid_bools.copy() if self.sanity_check and self.experiment_type.lower() in ["kin", "deg"]: indices_valid_bools = np.where(valid_bools)[0] - t, L = ( - t.flatten(), + self.t, L = ( + self.t.flatten(), (0 if self.Ul is None else self.Ul) + (0 if self.Sl is None else self.Sl), ) - t_uniq = np.unique(t) + t_uniq = np.unique(self.t) valid_gene_checker = np.zeros(gene_num, dtype=bool) for L_iter, cur_L in tqdm( @@ -613,7 +612,7 @@ def estimate(self): desc=f"sanity check of {self.experiment_type} experiment data:", ): cur_L = cur_L.A.flatten() if issparse(cur_L) else cur_L.flatten() - y = strat_mom(cur_L, t, np.nanmean) + y = strat_mom(cur_L, self.t, np.nanmean) slope, _ = fit_linreg(t_uniq, y, intercept=True, r2=False) valid_gene_checker[L_iter] = ( True @@ -635,7 +634,7 @@ def estimate(self): (None if self.S is None else self.S[valid_gene_checker, :]), (None if self.Sl is None else self.Sl[valid_gene_checker, :]), ) - self.subset_adata = self.subset_adata[:, valid_gene_checker] + subset_adata = subset_adata[:, valid_gene_checker] self.adata.var[kin_param_pre + "sanity_check"] = valid_bools_ if self.assumption_mRNA.lower() == "auto": @@ -671,6 +670,7 @@ def estimate(self): self.dynamics_ss( cur_grp_i=cur_grp_i, cur_grp=cur_grp, + subset_adata=subset_adata, cur_cells_bools=cur_cells_bools, valid_bools_=valid_bools_, kin_param_pre=kin_param_pre, @@ -679,6 +679,7 @@ def estimate(self): self.dynamics_kin( cur_grp_i=cur_grp_i, cur_grp=cur_grp, + subset_adata=subset_adata, cur_cells_bools=cur_cells_bools, valid_bools_=valid_bools_, kin_param_pre=kin_param_pre, @@ -703,8 +704,8 @@ def estimate(self): "filter_gene_mode": filter_gene_mode, "t": self.t, "group": self.group, - "X_data": X_data, - "X_fit_data": X_fit_data, + "X_data": self.X_data, + "X_fit_data": self.X_fit_data, "asspt_mRNA": self.assumption_mRNA, "experiment_type": self.experiment_type, "normalized": self.normalized, From b689c55232101c1c8622d53b903697e241380971 Mon Sep 17 00:00:00 2001 From: sichao Date: Fri, 2 Jun 2023 18:17:58 -0400 Subject: [PATCH 03/31] add sanity check method --- dynamo/tools/dynamics.py | 82 ++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 40 deletions(-) diff --git a/dynamo/tools/dynamics.py b/dynamo/tools/dynamics.py index f950e411d..512072322 100755 --- a/dynamo/tools/dynamics.py +++ b/dynamo/tools/dynamics.py @@ -112,9 +112,6 @@ def __init__( self.tkey = adata.uns["pp"]["tkey"] if tkey is None else 
tkey self.est_kwargs = est_kwargs - def check_model(self, model): - pass - def _calc_vel_utils_ss(self, vel, U1, S1, U2, S2): if self.has_splicing: if self.experiment_type == "kin": @@ -532,6 +529,46 @@ def smooth(self, valid_bools): f"performed. Try setting re_smooth = True if not sure." ) + def sanity_check(self, valid_bools, valid_bools_, gene_num, subset_adata, kin_param_pre): + indices_valid_bools = np.where(valid_bools)[0] + self.t, L = ( + self.t.flatten(), + (0 if self.Ul is None else self.Ul) + (0 if self.Sl is None else self.Sl), + ) + t_uniq = np.unique(self.t) + + valid_gene_checker = np.zeros(gene_num, dtype=bool) + for L_iter, cur_L in tqdm( + enumerate(L), + desc=f"sanity check of {self.experiment_type} experiment data:", + ): + cur_L = cur_L.A.flatten() if issparse(cur_L) else cur_L.flatten() + y = strat_mom(cur_L, self.t, np.nanmean) + slope, _ = fit_linreg(t_uniq, y, intercept=True, r2=False) + valid_gene_checker[L_iter] = ( + True + if (slope > 0 and self.experiment_type == "kin") or (slope < 0 and self.experiment_type == "deg") + else False + ) + valid_bools_[indices_valid_bools[~valid_gene_checker]] = False + main_warning(f"filtering {gene_num - valid_gene_checker.sum()} genes after sanity check.") + + if len(valid_bools_) < 5: + raise Exception( + f"After sanity check, you have less than 5 valid genes. Something is wrong about your " + f"metabolic labeling experiment!" + ) + + self.U, self.Ul, self.S, self.Sl = ( + (None if self.U is None else self.U[valid_gene_checker, :]), + (None if self.Ul is None else self.Ul[valid_gene_checker, :]), + (None if self.S is None else self.S[valid_gene_checker, :]), + (None if self.Sl is None else self.Sl[valid_gene_checker, :]), + ) + subset_adata = subset_adata[:, valid_gene_checker] + self.adata.var[kin_param_pre + "sanity_check"] = valid_bools_ + return subset_adata, valid_bools_ + def estimate(self): (self.experiment_type, self.has_splicing, self.has_labeling, self.splicing_labeling, self.has_protein,) = ( self.adata.uns["pp"]["experiment_type"], @@ -599,43 +636,8 @@ def estimate(self): valid_bools_ = valid_bools.copy() if self.sanity_check and self.experiment_type.lower() in ["kin", "deg"]: - indices_valid_bools = np.where(valid_bools)[0] - self.t, L = ( - self.t.flatten(), - (0 if self.Ul is None else self.Ul) + (0 if self.Sl is None else self.Sl), - ) - t_uniq = np.unique(self.t) - - valid_gene_checker = np.zeros(gene_num, dtype=bool) - for L_iter, cur_L in tqdm( - enumerate(L), - desc=f"sanity check of {self.experiment_type} experiment data:", - ): - cur_L = cur_L.A.flatten() if issparse(cur_L) else cur_L.flatten() - y = strat_mom(cur_L, self.t, np.nanmean) - slope, _ = fit_linreg(t_uniq, y, intercept=True, r2=False) - valid_gene_checker[L_iter] = ( - True - if (slope > 0 and self.experiment_type == "kin") or (slope < 0 and self.experiment_type == "deg") - else False - ) - valid_bools_[indices_valid_bools[~valid_gene_checker]] = False - main_warning(f"filtering {gene_num - valid_gene_checker.sum()} genes after sanity check.") - - if len(valid_bools_) < 5: - raise Exception( - f"After sanity check, you have less than 5 valid genes. Something is wrong about your " - f"metabolic labeling experiment!" 
- ) - - self.U, self.Ul, self.S, self.Sl = ( - (None if self.U is None else self.U[valid_gene_checker, :]), - (None if self.Ul is None else self.Ul[valid_gene_checker, :]), - (None if self.S is None else self.S[valid_gene_checker, :]), - (None if self.Sl is None else self.Sl[valid_gene_checker, :]), - ) - subset_adata = subset_adata[:, valid_gene_checker] - self.adata.var[kin_param_pre + "sanity_check"] = valid_bools_ + subset_adata, valid_bools_ = self.sanity_check( + valid_bools, valid_bools_, gene_num, subset_adata, kin_param_pre) if self.assumption_mRNA.lower() == "auto": self.assumption_mRNA = assump_mRNA From 4ebf215f84af04ed48b24bd92bf323112f157b67 Mon Sep 17 00:00:00 2001 From: sichao Date: Wed, 7 Jun 2023 17:20:06 -0400 Subject: [PATCH 04/31] optimize structure --- dynamo/tools/dynamics.py | 119 +++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 62 deletions(-) diff --git a/dynamo/tools/dynamics.py b/dynamo/tools/dynamics.py index 512072322..e3d84b7e1 100755 --- a/dynamo/tools/dynamics.py +++ b/dynamo/tools/dynamics.py @@ -112,20 +112,20 @@ def __init__( self.tkey = adata.uns["pp"]["tkey"] if tkey is None else tkey self.est_kwargs = est_kwargs - def _calc_vel_utils_ss(self, vel, U1, S1, U2, S2): + def _calc_vel_utils_ss(self, vel, U, S, N, T): if self.has_splicing: if self.experiment_type == "kin": Kc = np.clip(self.gamma[:, None], 0, 1 - 1e-3) # S - U slope gamma_ = -(np.log(1 - Kc) / self.t[None, :]) # actual gamma - vel_U = U2.multiply(csr_matrix(gamma_ / Kc)) - csr_matrix(self.beta).multiply( - U1) # vel.vel_s(U_) - vel_S = vel.vel_s(U1, S1) + vel_U = N.multiply(csr_matrix(gamma_ / Kc)) - csr_matrix(self.beta).multiply( + U) # vel.vel_s(U_) + vel_S = vel.vel_s(U, S) - vel_N = (U2 - csr_matrix(Kc).multiply(U2)).multiply( + vel_N = (N - csr_matrix(Kc).multiply(N)).multiply( csr_matrix(gamma_ / Kc)) # vel.vel_u(U) # scale back to true velocity via multiplying "gamma_ / Kc". - vel_T = (U2 - csr_matrix(Kc).multiply(S2)).multiply(csr_matrix(gamma_ / Kc)) + vel_T = (N - csr_matrix(Kc).multiply(T)).multiply(csr_matrix(gamma_ / Kc)) elif self.experiment_type == "mix_std_stm": # steady state RNA: u0, stimulation RNA: u_new; # cell-wise transcription rate under simulation: alpha1 @@ -134,18 +134,18 @@ def _calc_vel_utils_ss(self, vel, U1, S1, U2, S2): t1=self.t, alpha0=self.alpha[0], beta=self.beta, - u1=U2, + u1=N, ) - vel_U = alpha1 - csr_matrix(self.beta[:, None]).multiply(U1) - vel_S = vel.vel_s(U1, S1) + vel_U = alpha1 - csr_matrix(self.beta[:, None]).multiply(U) + vel_S = vel.vel_s(U, S) vel_N = alpha1 - csr_matrix(self.gamma[:, None]).multiply(u_new) - vel_T = alpha1 - csr_matrix(self.beta[:, None]).multiply(S2) + vel_T = alpha1 - csr_matrix(self.beta[:, None]).multiply(T) else: - vel_U = vel.vel_u(U1) - vel_S = vel.vel_s(U1, S1) - vel_N = vel.vel_u(U2) - vel_T = vel.vel_s(U2, S2 - U2) # need to consider splicing + vel_U = vel.vel_u(U) + vel_S = vel.vel_s(U, S) + vel_N = vel.vel_u(N) + vel_T = vel.vel_s(N, T - N) # need to consider splicing else: if self.experiment_type == "kin": vel_U = np.nan @@ -153,10 +153,10 @@ def _calc_vel_utils_ss(self, vel, U1, S1, U2, S2): Kc = np.clip(self.gamma[:, None], 0, 1 - 1e-3) # S - U slope gamma_ = -(np.log(1 - Kc) / self.t[None, :]) # actual gamma - vel_N = (U2 - csr_matrix(Kc).multiply(U2)).multiply( + vel_N = (N - csr_matrix(Kc).multiply(N)).multiply( csr_matrix(gamma_ / Kc)) # vel.vel_u(U) # scale back to true velocity via multiplying "gamma_ / Kc". 
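# Illustration (a minimal sketch, not part of the patch): the sanity_check method introduced
# earlier in this series keeps a gene only when the time trend of its mean labeled counts
# matches the experiment type -- increasing for "kin" (labeling) data, decreasing for "deg"
# data. The same test with plain numpy and hypothetical values (np.polyfit stands in for
# dynamo's fit_linreg):
import numpy as np

t = np.array([0.5, 1.0, 2.0, 4.0])               # labeling time points
mean_labeled = np.array([0.2, 0.4, 0.7, 1.1])    # stratified mean labeled counts per time point
slope = np.polyfit(t, mean_labeled, 1)[0]        # fitted slope of labeled counts vs. time
keep_for_kin = slope > 0                         # a "deg" experiment would require slope < 0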
- vel_T = (U2 - csr_matrix(Kc).multiply(S2)).multiply(csr_matrix(gamma_ / Kc)) + vel_T = (N - csr_matrix(Kc).multiply(T)).multiply(csr_matrix(gamma_ / Kc)) elif self.experiment_type == "mix_std_stm": vel_U = np.nan vel_S = np.nan @@ -172,51 +172,51 @@ def _calc_vel_utils_ss(self, vel, U1, S1, U2, S2): ) vel_N = alpha1 - csr_matrix(self.gamma[:, None]).multiply(u_new) - vel_T = alpha1 - csr_matrix(self.gamma[:, None]).multiply(S2) + vel_T = alpha1 - csr_matrix(self.gamma[:, None]).multiply(T) else: vel_U = np.nan vel_S = np.nan - vel_N = vel.vel_u(U2) - vel_T = vel.vel_u(S2) # don't consider splicing + vel_N = vel.vel_u(N) + vel_T = vel.vel_u(T) # don't consider splicing return vel_U, vel_S, vel_N, vel_T - def _calc_vel_utils_kin(self, vel, U1, S1, U2, S2): + def _calc_vel_utils_kin(self, vel, U, S, N, T): if self.has_splicing: if self.experiment_type == "kin": - vel_U = vel.vel_u(U1) - vel_S = vel.vel_s(U1, S1) + vel_U = vel.vel_u(U) + vel_S = vel.vel_s(U, S) vel.parameters["beta"] = self.gamma - vel_N = vel.vel_u(U2) - vel_T = vel.vel_u(S2) # no need to consider splicing + vel_N = vel.vel_u(N) + vel_T = vel.vel_u(T) # no need to consider splicing elif self.experiment_type == "deg": if self.splicing_labeling: vel_U = np.nan - vel_S = vel.vel_s(U1, S1) + vel_S = vel.vel_s(U, S) vel_N = np.nan vel_T = np.nan else: vel_U = np.nan - vel_S = vel.vel_s(U1, S1) + vel_S = vel.vel_s(U, S) vel_N = np.nan vel_T = np.nan elif self.experiment_type in ["mix_kin_deg", "mix_pulse_chase"]: - vel_U = vel.vel_u(U1, repeat=True) - vel_S = vel.vel_s(U1, S1) + vel_U = vel.vel_u(U, repeat=True) + vel_S = vel.vel_s(U, S) vel.parameters["beta"] = self.gamma - vel_N = vel.vel_u(U2, repeat=True) - vel_T = vel.vel_u(S2, repeat=True) # no need to consider splicing + vel_N = vel.vel_u(N, repeat=True) + vel_T = vel.vel_u(T, repeat=True) # no need to consider splicing else: if self.experiment_type == "kin": vel_U = np.nan vel_S = np.nan # calculate cell-wise alpha, if est_method is twostep, this can be skipped - alpha_ = one_shot_alpha_matrix(U2, self.gamma, self.t) + alpha_ = one_shot_alpha_matrix(N, self.gamma, self.t) vel.parameters["alpha"] = alpha_ - vel_N = vel.vel_u(U2) - vel_T = vel.vel_u(S2) # don't consider splicing + vel_N = vel.vel_u(N) + vel_T = vel.vel_u(T) # don't consider splicing elif self.experiment_type == "deg": vel_U = np.nan vel_S = np.nan @@ -225,16 +225,9 @@ def _calc_vel_utils_kin(self, vel, U1, S1, U2, S2): elif self.experiment_type in ["mix_kin_deg", "mix_pulse_chase"]: vel_U = np.nan vel_S = np.nan - vel_N = vel.vel_u(U2, repeat=True) + vel_N = vel.vel_u(N, repeat=True) # TODO: figure out whether we need repeat here - vel_T = vel.vel_u(S2, repeat=True) # don't consider splicing - return vel_U, vel_S, vel_N, vel_T - - def _calc_vel_utils(self, vel, vel_func, U, S, U_, S_): - if self.NTR_vel: - vel_U, vel_S, vel_N, vel_T = vel_func(vel=vel, U1=U_, S1=S_, U2=U, S2=S) - else: - vel_U, vel_S, vel_N, vel_T = vel_func(vel=vel, U1=U, S1=S, U2=U_, S2=S_) + vel_T = vel.vel_u(T, repeat=True) # don't consider splicing return vel_U, vel_S, vel_N, vel_T def calculate_velocity_ss(self, subset_adata): @@ -244,7 +237,15 @@ def calculate_velocity_ss(self, subset_adata): self.has_splicing, self.has_labeling, self.log_unnormalized, - self.NTR_vel, + False, + ) + N, T = get_U_S_for_velocity_estimation( + subset_adata, + self.use_smoothed, + self.has_splicing, + self.has_labeling, + self.log_unnormalized, + True, ) vel = Velocity(estimation=self.est) @@ -254,21 +255,17 @@ def calculate_velocity_ss(self, 
subset_adata): "kin", "mix_std_stm", ]: - U_, S_ = get_U_S_for_velocity_estimation( - subset_adata, - self.use_smoothed, - self.has_splicing, - self.has_labeling, - self.log_unnormalized, - not self.NTR_vel, - ) - vel_U, vel_S, vel_N, vel_T = self._calc_vel_utils(vel=vel, vel_func=self._calc_vel_utils_ss, U=U, S=S, U_=U_, S_=S_) + vel_U, vel_S, vel_N, vel_T = self._calc_vel_utils_ss(vel=vel, U=U, S=S, N=N, T=T) else: - vel_U = vel.vel_u(U) - vel_S = vel.vel_s(U, S) + if self.NTR_vel: + vel_U = vel.vel_u(N) + vel_S = vel.vel_s(N, T) + else: + vel_U = vel.vel_u(U) + vel_S = vel.vel_s(U, S) vel_N, vel_T = np.nan, np.nan - vel_P = vel.vel_p(S, self.P) + vel_P = vel.vel_p(T, self.P) if self.NTR_vel else vel.vel_p(S, self.P) return vel_U, vel_S, vel_N, vel_T, vel_P @@ -276,27 +273,25 @@ def calculate_velocity_kin(self, subset_adata): # if alpha = None, set alpha to be U; N - gamma R params = {"alpha": self.alpha, "beta": self.beta, "gamma": self.gamma, "t": self.t} vel = Velocity(**params) - # Fix below: U, S = get_U_S_for_velocity_estimation( subset_adata, self.use_smoothed, self.has_splicing, self.has_labeling, self.log_unnormalized, - self.NTR_vel, + False, ) - - U_, S_ = get_U_S_for_velocity_estimation( + N, T = get_U_S_for_velocity_estimation( subset_adata, self.use_smoothed, self.has_splicing, self.has_labeling, self.log_unnormalized, - not self.NTR_vel, + True, ) - vel_U, vel_S, vel_N, vel_T = self._calc_vel_utils(vel=vel, vel_func=self._calc_vel_utils_kin, U=U, S=S, U_=U_, S_=S_) + vel_U, vel_S, vel_N, vel_T = self._calc_vel_utils_kin(vel=vel, U=U, S=S, N=N, T=T) - vel_P = vel.vel_p(S, self.P) + vel_P = vel.vel_p(T, self.P) if self.NTR_vel else vel.vel_p(S, self.P) return vel_U, vel_S, vel_N, vel_T, vel_P From 0de649fbc210b3274757c771bbe3d011429d11d2 Mon Sep 17 00:00:00 2001 From: sichao Date: Mon, 12 Jun 2023 18:16:34 -0400 Subject: [PATCH 05/31] reconstruct basic framework --- dynamo/tools/dynamics.py | 540 ++++++++++++++++++--------------------- 1 file changed, 250 insertions(+), 290 deletions(-) diff --git a/dynamo/tools/dynamics.py b/dynamo/tools/dynamics.py index e3d84b7e1..7c203e23b 100755 --- a/dynamo/tools/dynamics.py +++ b/dynamo/tools/dynamics.py @@ -52,7 +52,7 @@ warnings.simplefilter("ignore", SparseEfficiencyWarning) -class Dynamics: +class BaseDynamics: def __init__( self, adata: AnnData, @@ -61,6 +61,12 @@ def __init__( assumption_mRNA: Literal["ss", "kinetic", "auto"] = "auto", assumption_protein: Literal["ss"] = "ss", model: Literal["auto", "deterministic", "stochastic"] = "auto", + model_was_auto: bool = True, + experiment_type: str = None, + has_splicing: bool = True, + has_labeling: bool = False, + splicing_labeling: bool = False, + has_protein: bool = False, est_method: Literal["ols", "rlm", "ransac", "gmm", "negbin", "auto", "twostep", "direct"] = "auto", NTR_vel: bool = False, group: Optional[str] = None, @@ -76,19 +82,18 @@ def __init__( tkey: str = None, **est_kwargs, ): - if "pp" not in adata.uns_keys(): - raise ValueError(f"\nPlease run `dyn.pp.receipe_monocle(adata)` before running this function!") self.adata = adata self.filter_gene_mode = filter_gene_mode self.use_smoothed = use_smoothed self.assumption_mRNA = assumption_mRNA self.assumption_protein = assumption_protein - if model.lower() == "auto": - self.model = "stochastic" - self.model_was_auto = True - else: - self.model = model - self.model_was_auto = False + self.model = model + self.model_was_auto = model_was_auto + self.experiment_type = experiment_type + self.has_splicing = has_splicing + 
self.has_labeling = has_labeling + self.splicing_labeling = splicing_labeling + self.has_protein = has_protein self.est_method = est_method self.NTR_vel = NTR_vel self.group = group @@ -112,255 +117,7 @@ def __init__( self.tkey = adata.uns["pp"]["tkey"] if tkey is None else tkey self.est_kwargs = est_kwargs - def _calc_vel_utils_ss(self, vel, U, S, N, T): - if self.has_splicing: - if self.experiment_type == "kin": - Kc = np.clip(self.gamma[:, None], 0, 1 - 1e-3) # S - U slope - gamma_ = -(np.log(1 - Kc) / self.t[None, :]) # actual gamma - - vel_U = N.multiply(csr_matrix(gamma_ / Kc)) - csr_matrix(self.beta).multiply( - U) # vel.vel_s(U_) - vel_S = vel.vel_s(U, S) - - vel_N = (N - csr_matrix(Kc).multiply(N)).multiply( - csr_matrix(gamma_ / Kc)) # vel.vel_u(U) - # scale back to true velocity via multiplying "gamma_ / Kc". - vel_T = (N - csr_matrix(Kc).multiply(T)).multiply(csr_matrix(gamma_ / Kc)) - elif self.experiment_type == "mix_std_stm": - # steady state RNA: u0, stimulation RNA: u_new; - # cell-wise transcription rate under simulation: alpha1 - u0, u_new, alpha1 = solve_alpha_2p_mat( - t0=np.max(self.t) - self.t, - t1=self.t, - alpha0=self.alpha[0], - beta=self.beta, - u1=N, - ) - vel_U = alpha1 - csr_matrix(self.beta[:, None]).multiply(U) - vel_S = vel.vel_s(U, S) - - vel_N = alpha1 - csr_matrix(self.gamma[:, None]).multiply(u_new) - vel_T = alpha1 - csr_matrix(self.beta[:, None]).multiply(T) - else: - vel_U = vel.vel_u(U) - vel_S = vel.vel_s(U, S) - vel_N = vel.vel_u(N) - vel_T = vel.vel_s(N, T - N) # need to consider splicing - else: - if self.experiment_type == "kin": - vel_U = np.nan - vel_S = np.nan - - Kc = np.clip(self.gamma[:, None], 0, 1 - 1e-3) # S - U slope - gamma_ = -(np.log(1 - Kc) / self.t[None, :]) # actual gamma - vel_N = (N - csr_matrix(Kc).multiply(N)).multiply( - csr_matrix(gamma_ / Kc)) # vel.vel_u(U) - # scale back to true velocity via multiplying "gamma_ / Kc". 
- vel_T = (N - csr_matrix(Kc).multiply(T)).multiply(csr_matrix(gamma_ / Kc)) - elif self.experiment_type == "mix_std_stm": - vel_U = np.nan - vel_S = np.nan - - # steady state RNA: u0, stimulation RNA: u_new; - # cell-wise transcription rate under simulation: alpha1 - u0, u_new, alpha1 = solve_alpha_2p_mat( - t0=np.max(self.t) - self.t, - t1=self.t, - alpha0=self.alpha[0], - beta=self.gamma, - u1=self.U, - ) - - vel_N = alpha1 - csr_matrix(self.gamma[:, None]).multiply(u_new) - vel_T = alpha1 - csr_matrix(self.gamma[:, None]).multiply(T) - else: - vel_U = np.nan - vel_S = np.nan - vel_N = vel.vel_u(N) - vel_T = vel.vel_u(T) # don't consider splicing - return vel_U, vel_S, vel_N, vel_T - - def _calc_vel_utils_kin(self, vel, U, S, N, T): - if self.has_splicing: - if self.experiment_type == "kin": - vel_U = vel.vel_u(U) - vel_S = vel.vel_s(U, S) - vel.parameters["beta"] = self.gamma - vel_N = vel.vel_u(N) - vel_T = vel.vel_u(T) # no need to consider splicing - elif self.experiment_type == "deg": - if self.splicing_labeling: - vel_U = np.nan - vel_S = vel.vel_s(U, S) - vel_N = np.nan - vel_T = np.nan - else: - vel_U = np.nan - vel_S = vel.vel_s(U, S) - vel_N = np.nan - vel_T = np.nan - elif self.experiment_type in ["mix_kin_deg", "mix_pulse_chase"]: - vel_U = vel.vel_u(U, repeat=True) - vel_S = vel.vel_s(U, S) - vel.parameters["beta"] = self.gamma - vel_N = vel.vel_u(N, repeat=True) - vel_T = vel.vel_u(T, repeat=True) # no need to consider splicing - else: - if self.experiment_type == "kin": - vel_U = np.nan - vel_S = np.nan - - # calculate cell-wise alpha, if est_method is twostep, this can be skipped - alpha_ = one_shot_alpha_matrix(N, self.gamma, self.t) - - vel.parameters["alpha"] = alpha_ - - vel_N = vel.vel_u(N) - vel_T = vel.vel_u(T) # don't consider splicing - elif self.experiment_type == "deg": - vel_U = np.nan - vel_S = np.nan - vel_N = np.nan - vel_T = np.nan - elif self.experiment_type in ["mix_kin_deg", "mix_pulse_chase"]: - vel_U = np.nan - vel_S = np.nan - vel_N = vel.vel_u(N, repeat=True) - # TODO: figure out whether we need repeat here - vel_T = vel.vel_u(T, repeat=True) # don't consider splicing - return vel_U, vel_S, vel_N, vel_T - - def calculate_velocity_ss(self, subset_adata): - U, S = get_U_S_for_velocity_estimation( - subset_adata, - self.use_smoothed, - self.has_splicing, - self.has_labeling, - self.log_unnormalized, - False, - ) - N, T = get_U_S_for_velocity_estimation( - subset_adata, - self.use_smoothed, - self.has_splicing, - self.has_labeling, - self.log_unnormalized, - True, - ) - vel = Velocity(estimation=self.est) - - if self.experiment_type.lower() in [ - "one_shot", - "one-shot", - "kin", - "mix_std_stm", - ]: - vel_U, vel_S, vel_N, vel_T = self._calc_vel_utils_ss(vel=vel, U=U, S=S, N=N, T=T) - else: - if self.NTR_vel: - vel_U = vel.vel_u(N) - vel_S = vel.vel_s(N, T) - else: - vel_U = vel.vel_u(U) - vel_S = vel.vel_s(U, S) - vel_N, vel_T = np.nan, np.nan - - vel_P = vel.vel_p(T, self.P) if self.NTR_vel else vel.vel_p(S, self.P) - - return vel_U, vel_S, vel_N, vel_T, vel_P - - def calculate_velocity_kin(self, subset_adata): - # if alpha = None, set alpha to be U; N - gamma R - params = {"alpha": self.alpha, "beta": self.beta, "gamma": self.gamma, "t": self.t} - vel = Velocity(**params) - U, S = get_U_S_for_velocity_estimation( - subset_adata, - self.use_smoothed, - self.has_splicing, - self.has_labeling, - self.log_unnormalized, - False, - ) - N, T = get_U_S_for_velocity_estimation( - subset_adata, - self.use_smoothed, - self.has_splicing, - self.has_labeling, 
- self.log_unnormalized, - True, - ) - vel_U, vel_S, vel_N, vel_T = self._calc_vel_utils_kin(vel=vel, U=U, S=S, N=N, T=T) - - vel_P = vel.vel_p(T, self.P) if self.NTR_vel else vel.vel_p(S, self.P) - - return vel_U, vel_S, vel_N, vel_T, vel_P - - def set_velocity_ss(self, vel_U, vel_S, vel_N, vel_T, vel_P, cur_grp, cur_cells_bools, valid_bools_, kin_param_pre): - self.adata = set_velocity( - self.adata, - vel_U, - vel_S, - vel_N, - vel_T, - vel_P, - self._group, - cur_grp, - cur_cells_bools, - valid_bools_, - self.ind_for_proteins, - ) - - self.adata = set_param_ss( - self.adata, - self.est, - self.alpha, - self.beta, - self.gamma, - self.eta, - self.delta, - self.experiment_type, - self._group, - cur_grp, - kin_param_pre, - valid_bools_, - self.ind_for_proteins, - ) - - def set_velocity_kin(self, vel_U, vel_S, vel_N, vel_T, vel_P, cur_grp, cur_cells_bools, valid_bools_, kin_param_pre, extra_params): - self.adata = set_velocity( - self.adata, - vel_U, - vel_S, - vel_N, - vel_T, - vel_P, - self._group, - cur_grp, - cur_cells_bools, - valid_bools_, - self.ind_for_proteins, - ) - - self.adata = set_param_kinetic( - self.adata, - self.alpha, - self.a, - self.b, - self.alpha_a, - self.alpha_i, - self.beta, - self.gamma, - self.cost, - self.logLL, - kin_param_pre, - extra_params, - self._group, - cur_grp, - cur_cells_bools, - valid_bools_, - ) - - def estimate_vel_calc_params_ss(self, subset_adata): + def estimate_params_ss(self, subset_adata, **est_params_args): if self.est_method.lower() == "auto": self.est_method = "gmm" if self.model.lower() == "stochastic" else "ols" @@ -404,7 +161,7 @@ def estimate_vel_calc_params_ss(self, subset_adata): self.alpha, self.beta, self.gamma, self.eta, self.delta = self.est.parameters.values() - def estimate_vel_calc_params_kin(self, cur_grp_i, cur_grp, subset_adata): + def estimate_params_kin(self, cur_grp_i, cur_grp, subset_adata, **est_params_args): return_ntr = True if self.fraction_for_deg and self.experiment_type.lower() == "deg" else False if self.model_was_auto and self.experiment_type.lower() == "kin": @@ -469,18 +226,130 @@ def estimate_vel_calc_params_kin(self, cur_grp_i, cur_grp, subset_adata): "gamma", ] - extra_params = params.loc[:, params.columns.difference(all_kinetic_params)] - return extra_params + self.kin_extra_params = params.loc[:, params.columns.difference(all_kinetic_params)] + + def estimate_parameters(self, cur_grp_i, cur_grp, subset_adata, **est_params_args): + if self.assumption_mRNA.lower() == "ss" or (self.experiment_type.lower() in ["one-shot", "mix_std_stm"]): + self.estimate_params_ss(subset_adata=subset_adata, **est_params_args) + elif self.assumption_mRNA.lower() == "kinetic": + self.estimate_params_kin(cur_grp_i=cur_grp_i, cur_grp=cur_grp, subset_adata=subset_adata, **est_params_args) + else: + main_warning("Not implemented yet.") + + def set_velocity(self, vel_U, vel_S, vel_N, vel_T, vel_P, cur_grp, cur_cells_bools, valid_bools_, kin_param_pre, **set_velo_args): + if self.assumption_mRNA.lower() == "ss" or (self.experiment_type.lower() in ["one-shot", "mix_std_stm"]): + self.adata = set_velocity( + self.adata, + vel_U, + vel_S, + vel_N, + vel_T, + vel_P, + self._group, + cur_grp, + cur_cells_bools, + valid_bools_, + self.ind_for_proteins, + ) + + self.adata = set_param_ss( + self.adata, + self.est, + self.alpha, + self.beta, + self.gamma, + self.eta, + self.delta, + self.experiment_type, + self._group, + cur_grp, + kin_param_pre, + valid_bools_, + self.ind_for_proteins, + ) + elif self.assumption_mRNA.lower() == "kinetic": 
+ self.adata = set_velocity( + self.adata, + vel_U, + vel_S, + vel_N, + vel_T, + vel_P, + self._group, + cur_grp, + cur_cells_bools, + valid_bools_, + self.ind_for_proteins, + ) + + self.adata = set_param_kinetic( + self.adata, + self.alpha, + self.a, + self.b, + self.alpha_a, + self.alpha_i, + self.beta, + self.gamma, + self.cost, + self.logLL, + kin_param_pre, + self.kin_extra_params, + self._group, + cur_grp, + cur_cells_bools, + valid_bools_, + ) + else: + main_warning("Not implemented yet.") + + def calculate_vel_U(self, vel, U, S, N, T): + raise NotImplementedError("This method has not been implemented.") + + def calculate_vel_S(self, vel, U, S, N, T): + raise NotImplementedError("This method has not been implemented.") + + def calculate_vel_N(self, vel, U, S, N, T): + raise NotImplementedError("This method has not been implemented.") - def dynamics_ss(self, cur_grp_i, cur_grp, subset_adata, cur_cells_bools, valid_bools_, kin_param_pre): - self.estimate_vel_calc_params_ss(subset_adata=subset_adata) - vel_U, vel_S, vel_N, vel_T, vel_P = self.calculate_velocity_ss(subset_adata=subset_adata) - self.set_velocity_ss(vel_U, vel_S, vel_N, vel_T, vel_P, cur_grp, cur_cells_bools, valid_bools_, kin_param_pre) + def calculate_vel_T(self, vel, U, S, N, T): + raise NotImplementedError("This method has not been implemented.") - def dynamics_kin(self, cur_grp_i, cur_grp, subset_adata, cur_cells_bools, valid_bools_, kin_param_pre): - extra_params = self.estimate_vel_calc_params_kin(cur_grp_i=cur_grp_i, cur_grp=cur_grp, subset_adata=subset_adata) - vel_U, vel_S, vel_N, vel_T, vel_P = self.calculate_velocity_kin(subset_adata=subset_adata) - self.set_velocity_kin(vel_U, vel_S, vel_N, vel_T, vel_P, cur_grp, cur_cells_bools, valid_bools_, kin_param_pre, extra_params) + def calculate_vel_P(self, vel, U, S, N, T): + return vel.vel_p(T, self.P) if self.NTR_vel else vel.vel_p(S, self.P) + + def calculate_velocity(self, subset_adata): + U, S = get_U_S_for_velocity_estimation( + subset_adata, + self.use_smoothed, + self.has_splicing, + self.has_labeling, + self.log_unnormalized, + False, + ) + N, T = get_U_S_for_velocity_estimation( + subset_adata, + self.use_smoothed, + self.has_splicing, + self.has_labeling, + self.log_unnormalized, + True, + ) + if self.assumption_mRNA.lower() == "ss" or (self.experiment_type.lower() in ["one-shot", "mix_std_stm"]): + vel = Velocity(estimation=self.est) + elif self.assumption_mRNA.lower() == "kinetic": + params = {"alpha": self.alpha, "beta": self.beta, "gamma": self.gamma, "t": self.t} + vel = Velocity(**params) + else: + main_warning("Not implemented yet.") + + vel_U = self.calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) + vel_S = self.calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) + vel_N = self.calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) + vel_T = self.calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) + vel_P = self.calculate_vel_P(vel=vel, U=U, S=S, N=N, T=T) + + return vel_U, vel_S, vel_N, vel_T, vel_P def filter(self): filter_list, filter_gene_mode_list = ( @@ -663,27 +532,9 @@ def estimate(self): ]: self.model = "deterministic" - if self.assumption_mRNA.lower() == "ss" or (self.experiment_type.lower() in ["one-shot", "mix_std_stm"]): - self.dynamics_ss( - cur_grp_i=cur_grp_i, - cur_grp=cur_grp, - subset_adata=subset_adata, - cur_cells_bools=cur_cells_bools, - valid_bools_=valid_bools_, - kin_param_pre=kin_param_pre, - ) - elif self.assumption_mRNA.lower() == "kinetic": - self.dynamics_kin( - cur_grp_i=cur_grp_i, - cur_grp=cur_grp, - subset_adata=subset_adata, - 
cur_cells_bools=cur_cells_bools, - valid_bools_=valid_bools_, - kin_param_pre=kin_param_pre, - ) - # add protein related parameters in the moment model below: - elif self.model.lower() == "model_selection": - main_warning("Not implemented yet.") + self.estimate_parameters(cur_grp_i=cur_grp_i, cur_grp=cur_grp, subset_adata=subset_adata) + vel_U, vel_S, vel_N, vel_T, vel_P = self.calculate_velocity(subset_adata=subset_adata) + self.set_velocity(vel_U, vel_S, vel_N, vel_T, vel_P, cur_grp, cur_cells_bools, valid_bools_, kin_param_pre) if self.group is not None and self.group in self.adata.obs[self.group]: uns_key = self.group + "_dynamics" @@ -724,6 +575,115 @@ def estimate(self): return self.adata +class SplicedDynamics(BaseDynamics): + # TODO: make sure NTR_vel is False when initialization and remove NTR_vel here + def calculate_vel_U(self, vel, U, S, N, T): + return vel.vel_u(N) if self.NTR_vel else vel.vel_u(U) + + def calculate_vel_S(self, vel, U, S, N, T): + return vel.vel_s(N, T) if self.NTR_vel else vel.vel_s(U, S) + + def calculate_vel_N(self, vel, U, S, N, T): + return np.nan + + def calculate_vel_T(self, vel, U, S, N, T): + return np.nan + + +class LabeledDynamics(BaseDynamics): + def calculate_vel_U(self, vel, U, S, N, T): + return vel.vel_u(U) + + def calculate_vel_S(self, vel, U, S, N, T): + return vel.vel_s(U, S) + + def calculate_vel_N(self, vel, U, S, N, T): + return vel.vel_u(N) + + def calculate_vel_T(self, vel, U, S, N, T): + return vel.vel_s(N, T - N) + + def calculate_velocity(self, subset_adata): + U, S = get_U_S_for_velocity_estimation( + subset_adata, + self.use_smoothed, + self.has_splicing, + self.has_labeling, + self.log_unnormalized, + False, + ) + N, T = get_U_S_for_velocity_estimation( + subset_adata, + self.use_smoothed, + self.has_splicing, + self.has_labeling, + self.log_unnormalized, + True, + ) + if self.assumption_mRNA.lower() == "ss" or (self.experiment_type.lower() in ["one-shot", "mix_std_stm"]): + vel = Velocity(estimation=self.est) + elif self.assumption_mRNA.lower() == "kinetic": + params = {"alpha": self.alpha, "beta": self.beta, "gamma": self.gamma, "t": self.t} + vel = Velocity(**params) + else: + main_warning("Not implemented yet.") + + if self.has_splicing: + vel_U = self.calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) + vel_S = self.calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) + else: + vel_U = np.nan + vel_S = np.nan + vel_N = self.calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) + vel_T = self.calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) + vel_P = self.calculate_vel_P(vel=vel, U=U, S=S, N=N, T=T) + + return vel_U, vel_S, vel_N, vel_T, vel_P + + +# TODO: rename this later +def dynamics_wrapper( + adata: AnnData, + filter_gene_mode: Literal["final", "basic", "no"] = "final", + use_smoothed: bool = True, + assumption_mRNA: Literal["ss", "kinetic", "auto"] = "auto", + assumption_protein: Literal["ss"] = "ss", + model: Literal["auto", "deterministic", "stochastic"] = "auto", + est_method: Literal["ols", "rlm", "ransac", "gmm", "negbin", "auto", "twostep", "direct"] = "auto", + NTR_vel: bool = False, + group: Optional[str] = None, + protein_names: Optional[List[str]] = None, + concat_data: bool = False, + log_unnormalized: bool = True, + one_shot_method: Literal["combined", "sci-fate", "sci_fate"] = "combined", + fraction_for_deg: bool = False, + re_smooth: bool = False, + sanity_check: bool = False, + del_2nd_moments: Optional[bool] = None, + cores: int = 1, + tkey: str = None, + **est_kwargs, +) -> AnnData: + """Run corresponding Dynamics methods 
according to the parameters.""" + if "pp" not in adata.uns_keys(): + raise ValueError(f"\nPlease run `dyn.pp.receipe_monocle(adata)` before running this function!") + if model.lower() == "auto": + model = "stochastic" + model_was_auto = True + else: + model = model + model_was_auto = False + + (experiment_type, has_splicing, has_labeling, splicing_labeling, has_protein,) = ( + adata.uns["pp"]["experiment_type"], + adata.uns["pp"]["has_splicing"], + adata.uns["pp"]["has_labeling"], + adata.uns["pp"]["splicing_labeling"], + adata.uns["pp"]["has_protein"], + ) + pass + + # incorporate the model selection code soon def dynamics( adata: AnnData, From 7057ad1bdc4bfa03bee9200c7ee27678b04a86d0 Mon Sep 17 00:00:00 2001 From: sichao Date: Mon, 12 Jun 2023 19:32:35 -0400 Subject: [PATCH 06/31] implement caller --- dynamo/tools/dynamics.py | 137 +++++++++++++++++++++++---------------- 1 file changed, 82 insertions(+), 55 deletions(-) diff --git a/dynamo/tools/dynamics.py b/dynamo/tools/dynamics.py index 7c203e23b..03daa73bc 100755 --- a/dynamo/tools/dynamics.py +++ b/dynamo/tools/dynamics.py @@ -434,14 +434,6 @@ def sanity_check(self, valid_bools, valid_bools_, gene_num, subset_adata, kin_pa return subset_adata, valid_bools_ def estimate(self): - (self.experiment_type, self.has_splicing, self.has_labeling, self.splicing_labeling, self.has_protein,) = ( - self.adata.uns["pp"]["experiment_type"], - self.adata.uns["pp"]["has_splicing"], - self.adata.uns["pp"]["has_labeling"], - self.adata.uns["pp"]["splicing_labeling"], - self.adata.uns["pp"]["has_protein"], - ) - self.X_data, self.X_fit_data = None, None filter_gene_mode, valid_bools, gene_num = self.filter() @@ -503,35 +495,6 @@ def estimate(self): subset_adata, valid_bools_ = self.sanity_check( valid_bools, valid_bools_, gene_num, subset_adata, kin_param_pre) - if self.assumption_mRNA.lower() == "auto": - self.assumption_mRNA = assump_mRNA - if self.experiment_type.lower() == "conventional": - self.assumption_mRNA = "ss" - elif self.experiment_type.lower() in ["mix_pulse_chase", "deg", "kin"]: - self.assumption_mRNA = "kinetic" - - if self.model.lower() == "stochastic" and self.experiment_type.lower() not in [ - "conventional", - "kinetics", - "degradation", - "kin", - "deg", - "one-shot", - ]: - """ - # temporially convert to deterministic model as moment model for mix_std_stm - and other types of labeling experiment is ongoing.""" - - self.model = "deterministic" - - if self.model_was_auto and self.experiment_type.lower() in [ - "kinetic", - "kin", - "degradation", - "deg", - ]: - self.model = "deterministic" - self.estimate_parameters(cur_grp_i=cur_grp_i, cur_grp=cur_grp, subset_adata=subset_adata) vel_U, vel_S, vel_N, vel_T, vel_P = self.calculate_velocity(subset_adata=subset_adata) self.set_velocity(vel_U, vel_S, vel_N, vel_T, vel_P, cur_grp, cur_cells_bools, valid_bools_, kin_param_pre) @@ -644,25 +607,9 @@ def calculate_velocity(self, subset_adata): # TODO: rename this later def dynamics_wrapper( adata: AnnData, - filter_gene_mode: Literal["final", "basic", "no"] = "final", - use_smoothed: bool = True, assumption_mRNA: Literal["ss", "kinetic", "auto"] = "auto", - assumption_protein: Literal["ss"] = "ss", model: Literal["auto", "deterministic", "stochastic"] = "auto", - est_method: Literal["ols", "rlm", "ransac", "gmm", "negbin", "auto", "twostep", "direct"] = "auto", - NTR_vel: bool = False, - group: Optional[str] = None, - protein_names: Optional[List[str]] = None, - concat_data: bool = False, - log_unnormalized: bool = True, - 
one_shot_method: Literal["combined", "sci-fate", "sci_fate"] = "combined", - fraction_for_deg: bool = False, - re_smooth: bool = False, - sanity_check: bool = False, - del_2nd_moments: Optional[bool] = None, - cores: int = 1, - tkey: str = None, - **est_kwargs, + **kwargs, ) -> AnnData: """Run corresponding Dynamics methods according to the parameters.""" if "pp" not in adata.uns_keys(): @@ -681,7 +628,87 @@ def dynamics_wrapper( adata.uns["pp"]["splicing_labeling"], adata.uns["pp"]["has_protein"], ) - pass + + ( + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + assump_mRNA, + ) = get_data_for_kin_params_estimation( + adata, + has_splicing, + has_labeling, + model, + kwargs["use_smoothed"], + kwargs["tkey"], + kwargs["protein_names"], + kwargs["log_unnormalized"], + kwargs["NTR_vel"], + ) + if assumption_mRNA.lower() == "auto": + assumption_mRNA = assump_mRNA + if experiment_type.lower() == "conventional": + assumption_mRNA = "ss" + elif experiment_type.lower() in ["mix_pulse_chase", "deg", "kin"]: + assumption_mRNA = "kinetic" + + if model.lower() == "stochastic" and experiment_type.lower() not in [ + "conventional", + "kinetics", + "degradation", + "kin", + "deg", + "one-shot", + ]: + """ + # temporially convert to deterministic model as moment model for mix_std_stm + and other types of labeling experiment is ongoing.""" + + model = "deterministic" + + if model_was_auto and experiment_type.lower() in [ + "kinetic", + "kin", + "degradation", + "deg", + ]: + model = "deterministic" + + if experiment_type == "conventional": + estimator = SplicedDynamics( + adata=adata, + assumption_mRNA=assumption_mRNA, + has_splicing=has_splicing, + has_labeling=has_labeling, + splicing_labeling=splicing_labeling, + has_protein=has_protein, + model=model, + model_was_auto=model_was_auto, + **kwargs, + ) + elif experiment_type in ["one-shot", "one_shot"]: + estimator = SplicedDynamics( + adata=adata, + assumption_mRNA=assumption_mRNA, + has_splicing=has_splicing, + has_labeling=has_labeling, + splicing_labeling=splicing_labeling, + has_protein=has_protein, + model=model, + model_was_auto=model_was_auto, + **kwargs, + ) + else: + raise NotImplementedError("This method has not been implemented.") + estimator.estimate() # incorporate the model selection code soon From f1fb1998421178e5de56c0d06a44b4a974a66be0 Mon Sep 17 00:00:00 2001 From: sichao Date: Tue, 13 Jun 2023 11:33:38 -0400 Subject: [PATCH 07/31] add sskin class --- dynamo/tools/dynamics.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/dynamo/tools/dynamics.py b/dynamo/tools/dynamics.py index 03daa73bc..6ba905e39 100755 --- a/dynamo/tools/dynamics.py +++ b/dynamo/tools/dynamics.py @@ -566,6 +566,9 @@ def calculate_vel_N(self, vel, U, S, N, T): def calculate_vel_T(self, vel, U, S, N, T): return vel.vel_s(N, T - N) + def calc_extra_parameters(self): + pass + def calculate_velocity(self, subset_adata): U, S = get_U_S_for_velocity_estimation( subset_adata, @@ -591,6 +594,7 @@ def calculate_velocity(self, subset_adata): else: main_warning("Not implemented yet.") + self.calc_extra_parameters() if self.has_splicing: vel_U = self.calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) vel_S = self.calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) @@ -604,6 +608,21 @@ def calculate_velocity(self, subset_adata): return vel_U, vel_S, vel_N, vel_T, vel_P +class SSKineticsDynamics(LabeledDynamics): + def calc_extra_parameters(self): + self.Kc = np.clip(self.gamma[:, None], 0, 1 - 1e-3) # S - U slope + self.gamma_ = -(np.log(1 - self.Kc) / 
self.t[None, :]) # actual gamma + + def calculate_vel_U(self, vel, U, S, N, T): + return N.multiply(csr_matrix(self.gamma_ / self.Kc)) - csr_matrix(self.beta).multiply(U) # vel.vel_s(U_) + + def calculate_vel_N(self, vel, U, S, N, T): + return (N - csr_matrix(self.Kc).multiply(N)).multiply(csr_matrix(self.gamma_ / self.Kc)) + + def calculate_vel_S(self, vel, U, S, N, T): + return (N - csr_matrix(self.Kc).multiply(T)).multiply(csr_matrix(self.gamma_ / self.Kc)) + + # TODO: rename this later def dynamics_wrapper( adata: AnnData, From d5d7ccea73c4f2cc3e90187030f38d9797b43d6e Mon Sep 17 00:00:00 2001 From: sichao Date: Tue, 13 Jun 2023 15:02:10 -0400 Subject: [PATCH 08/31] modify the structure of labeled dynamics --- dynamo/tools/dynamics.py | 172 ++++++++++++++++++++------------------- 1 file changed, 87 insertions(+), 85 deletions(-) diff --git a/dynamo/tools/dynamics.py b/dynamo/tools/dynamics.py index 6ba905e39..5e70163d4 100755 --- a/dynamo/tools/dynamics.py +++ b/dynamo/tools/dynamics.py @@ -117,7 +117,7 @@ def __init__( self.tkey = adata.uns["pp"]["tkey"] if tkey is None else tkey self.est_kwargs = est_kwargs - def estimate_params_ss(self, subset_adata, **est_params_args): + def _estimate_params_ss(self, subset_adata, **est_params_args): if self.est_method.lower() == "auto": self.est_method = "gmm" if self.model.lower() == "stochastic" else "ols" @@ -161,7 +161,7 @@ def estimate_params_ss(self, subset_adata, **est_params_args): self.alpha, self.beta, self.gamma, self.eta, self.delta = self.est.parameters.values() - def estimate_params_kin(self, cur_grp_i, cur_grp, subset_adata, **est_params_args): + def _estimate_params_kin(self, cur_grp_i, cur_grp, subset_adata, **est_params_args): return_ntr = True if self.fraction_for_deg and self.experiment_type.lower() == "deg" else False if self.model_was_auto and self.experiment_type.lower() == "kin": @@ -230,9 +230,9 @@ def estimate_params_kin(self, cur_grp_i, cur_grp, subset_adata, **est_params_arg def estimate_parameters(self, cur_grp_i, cur_grp, subset_adata, **est_params_args): if self.assumption_mRNA.lower() == "ss" or (self.experiment_type.lower() in ["one-shot", "mix_std_stm"]): - self.estimate_params_ss(subset_adata=subset_adata, **est_params_args) + self._estimate_params_ss(subset_adata=subset_adata, **est_params_args) elif self.assumption_mRNA.lower() == "kinetic": - self.estimate_params_kin(cur_grp_i=cur_grp_i, cur_grp=cur_grp, subset_adata=subset_adata, **est_params_args) + self._estimate_params_kin(cur_grp_i=cur_grp_i, cur_grp=cur_grp, subset_adata=subset_adata, **est_params_args) else: main_warning("Not implemented yet.") @@ -303,19 +303,10 @@ def set_velocity(self, vel_U, vel_S, vel_N, vel_T, vel_P, cur_grp, cur_cells_boo else: main_warning("Not implemented yet.") - def calculate_vel_U(self, vel, U, S, N, T): + def _calculate_velocity(self, vel, U, S, N, T): raise NotImplementedError("This method has not been implemented.") - def calculate_vel_S(self, vel, U, S, N, T): - raise NotImplementedError("This method has not been implemented.") - - def calculate_vel_N(self, vel, U, S, N, T): - raise NotImplementedError("This method has not been implemented.") - - def calculate_vel_T(self, vel, U, S, N, T): - raise NotImplementedError("This method has not been implemented.") - - def calculate_vel_P(self, vel, U, S, N, T): + def _calculate_vel_P(self, vel, U, S, N, T): return vel.vel_p(T, self.P) if self.NTR_vel else vel.vel_p(S, self.P) def calculate_velocity(self, subset_adata): @@ -343,15 +334,12 @@ def calculate_velocity(self, 
subset_adata): else: main_warning("Not implemented yet.") - vel_U = self.calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) - vel_S = self.calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) - vel_N = self.calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) - vel_T = self.calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) - vel_P = self.calculate_vel_P(vel=vel, U=U, S=S, N=N, T=T) + vel_U, vel_S, vel_N, vel_T = self._calculate_velocity(vel=vel, U=U, S=S, N=N, T=T) + vel_P = self._calculate_vel_P(vel=vel, U=U, S=S, N=N, T=T) return vel_U, vel_S, vel_N, vel_T, vel_P - def filter(self): + def _filter(self): filter_list, filter_gene_mode_list = ( [ "use_for_pca", @@ -373,7 +361,7 @@ def filter(self): raise Exception(f"no genes pass filter. Try resetting `filter_gene_mode = 'no'` to use all genes.") return filter_gene_mode, valid_bools, gene_num - def smooth(self, valid_bools): + def _smooth(self, valid_bools): M_layers = [i for i in self.adata.layers.keys() if i.startswith("M_")] if len(M_layers) < 2 or self.re_smooth: @@ -393,7 +381,7 @@ def smooth(self, valid_bools): f"performed. Try setting re_smooth = True if not sure." ) - def sanity_check(self, valid_bools, valid_bools_, gene_num, subset_adata, kin_param_pre): + def _sanity_check(self, valid_bools, valid_bools_, gene_num, subset_adata, kin_param_pre): indices_valid_bools = np.where(valid_bools)[0] self.t, L = ( self.t.flatten(), @@ -435,10 +423,10 @@ def sanity_check(self, valid_bools, valid_bools_, gene_num, subset_adata, kin_pa def estimate(self): self.X_data, self.X_fit_data = None, None - filter_gene_mode, valid_bools, gene_num = self.filter() + filter_gene_mode, valid_bools, gene_num = self._filter() if self.model.lower() == "stochastic" or self.use_smoothed or self.re_smooth: - self.smooth(valid_bools=valid_bools) + self._smooth(valid_bools=valid_bools) valid_adata = self.adata[:, valid_bools].copy() if self.group is not None and self.group in self.adata.obs.columns: @@ -539,88 +527,102 @@ def estimate(self): class SplicedDynamics(BaseDynamics): - # TODO: make sure NTR_vel is False when initialization and remove NTR_vel here - def calculate_vel_U(self, vel, U, S, N, T): - return vel.vel_u(N) if self.NTR_vel else vel.vel_u(U) + def _calculate_velocity(self, vel, U, S, N, T): + vel_U = vel.vel_u(U) + vel_S = vel.vel_s(U, S) + vel_N = np.nan + vel_T = np.nan + + +class LabeledDynamics(BaseDynamics): + def _calculate_vel_U(self, vel, U, S, N, T): + raise NotImplementedError("This method has not been implemented.") - def calculate_vel_S(self, vel, U, S, N, T): - return vel.vel_s(N, T) if self.NTR_vel else vel.vel_s(U, S) + def _calculate_vel_S(self, vel, U, S, N, T): + raise NotImplementedError("This method has not been implemented.") - def calculate_vel_N(self, vel, U, S, N, T): - return np.nan + def _calculate_vel_N(self, vel, U, S, N, T): + raise NotImplementedError("This method has not been implemented.") - def calculate_vel_T(self, vel, U, S, N, T): - return np.nan + def _calculate_vel_T(self, vel, U, S, N, T): + raise NotImplementedError("This method has not been implemented.") + def _calculate_velocity(self, vel, U, S, N, T): + if self.has_splicing: + vel_U = self._calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) + vel_S = self._calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) + else: + vel_U, vel_S = np.nan, np.nan + vel_N = self._calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) + vel_T = self._calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) + return vel_U, vel_S, vel_N, vel_T -class LabeledDynamics(BaseDynamics): - def calculate_vel_U(self, vel, U, S, N, T): + +class 
OneShotDynamics(LabeledDynamics): + def _calculate_vel_U(self, vel, U, S, N, T): return vel.vel_u(U) - def calculate_vel_S(self, vel, U, S, N, T): + def _calculate_vel_S(self, vel, U, S, N, T): return vel.vel_s(U, S) - def calculate_vel_N(self, vel, U, S, N, T): + def _calculate_vel_N(self, vel, U, S, N, T): return vel.vel_u(N) - def calculate_vel_T(self, vel, U, S, N, T): - return vel.vel_s(N, T - N) + def _calculate_vel_T(self, vel, U, S, N, T): + return vel.vel_s(N, T - N) if self.has_splicing else vel.vel_u(T) - def calc_extra_parameters(self): - pass - def calculate_velocity(self, subset_adata): - U, S = get_U_S_for_velocity_estimation( - subset_adata, - self.use_smoothed, - self.has_splicing, - self.has_labeling, - self.log_unnormalized, - False, - ) - N, T = get_U_S_for_velocity_estimation( - subset_adata, - self.use_smoothed, - self.has_splicing, - self.has_labeling, - self.log_unnormalized, - True, - ) - if self.assumption_mRNA.lower() == "ss" or (self.experiment_type.lower() in ["one-shot", "mix_std_stm"]): - vel = Velocity(estimation=self.est) - elif self.assumption_mRNA.lower() == "kinetic": - params = {"alpha": self.alpha, "beta": self.beta, "gamma": self.gamma, "t": self.t} - vel = Velocity(**params) - else: - main_warning("Not implemented yet.") +class SSKineticsDynamics(LabeledDynamics): + def _calculate_vel_U(self, vel, U, S, N, T): + return N.multiply(csr_matrix(self.gamma_ / self.Kc)) - csr_matrix(self.beta).multiply(U) - self.calc_extra_parameters() - if self.has_splicing: - vel_U = self.calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) - vel_S = self.calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) - else: - vel_U = np.nan - vel_S = np.nan - vel_N = self.calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) - vel_T = self.calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) - vel_P = self.calculate_vel_P(vel=vel, U=U, S=S, N=N, T=T) + def _calculate_vel_S(self, vel, U, S, N, T): + return vel.vel_s(U, S) - return vel_U, vel_S, vel_N, vel_T, vel_P + def _calculate_vel_N(self, vel, U, S, N, T): + return (N - csr_matrix(self.Kc).multiply(N)).multiply(csr_matrix(self.gamma_ / self.Kc)) + def _calculate_vel_T(self, vel, U, S, N, T): + return (N - csr_matrix(self.Kc).multiply(T)).multiply(csr_matrix(self.gamma_ / self.Kc)) -class SSKineticsDynamics(LabeledDynamics): - def calc_extra_parameters(self): + def _calculate_velocity(self, vel, U, S, N, T): self.Kc = np.clip(self.gamma[:, None], 0, 1 - 1e-3) # S - U slope self.gamma_ = -(np.log(1 - self.Kc) / self.t[None, :]) # actual gamma + if self.has_splicing: + vel_U = self._calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) + vel_S = self._calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) + else: + vel_U, vel_S = np.nan, np.nan + vel_N = self._calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) + vel_T = self._calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) + return vel_U, vel_S, vel_N, vel_T - def calculate_vel_U(self, vel, U, S, N, T): - return N.multiply(csr_matrix(self.gamma_ / self.Kc)) - csr_matrix(self.beta).multiply(U) # vel.vel_s(U_) - def calculate_vel_N(self, vel, U, S, N, T): - return (N - csr_matrix(self.Kc).multiply(N)).multiply(csr_matrix(self.gamma_ / self.Kc)) +class KineticsDynamics(LabeledDynamics): + def _calculate_vel_U(self, vel, U, S, N, T): + return vel.vel_u(U) - def calculate_vel_S(self, vel, U, S, N, T): - return (N - csr_matrix(self.Kc).multiply(T)).multiply(csr_matrix(self.gamma_ / self.Kc)) + def _calculate_vel_S(self, vel, U, S, N, T): + return vel.vel_s(U, S) + + def _calculate_vel_N(self, vel, U, S, N, T): + return vel.vel_u(N) + + def 
_calculate_vel_T(self, vel, U, S, N, T): + return vel.vel_u(T) + + def _calculate_velocity(self, vel, U, S, N, T): + if self.has_splicing: + vel_U = self._calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) + vel_S = self._calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) + vel.parameters["beta"] = self.gamma + else: + vel_U, vel_S = np.nan, np.nan + alpha_ = one_shot_alpha_matrix(T, self.gamma, self.t) + vel.parameters["alpha"] = alpha_ + vel_N = self._calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) + vel_T = self._calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) + return vel_U, vel_S, vel_N, vel_T # TODO: rename this later From 366926aa024bc1c93bd555b51c2769ad51c28c93 Mon Sep 17 00:00:00 2001 From: sichao Date: Tue, 13 Jun 2023 15:32:54 -0400 Subject: [PATCH 09/31] implement existing mix methods --- dynamo/tools/dynamics.py | 76 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/dynamo/tools/dynamics.py b/dynamo/tools/dynamics.py index 5e70163d4..1e4c62750 100755 --- a/dynamo/tools/dynamics.py +++ b/dynamo/tools/dynamics.py @@ -625,6 +625,82 @@ def _calculate_velocity(self, vel, U, S, N, T): return vel_U, vel_S, vel_N, vel_T +class DegradationDynamics(LabeledDynamics): + def _calculate_vel_U(self, vel, U, S, N, T): + return np.nan + + def _calculate_vel_S(self, vel, U, S, N, T): + return vel.vel_s(U, S) + + def _calculate_vel_N(self, vel, U, S, N, T): + return np.nan + + def _calculate_vel_T(self, vel, U, S, N, T): + return np.nan + + +class MixStdSdmDynamics(LabeledDynamics): + def _calculate_vel_U(self, vel, U, S, N, T): + return self.alpha1 - csr_matrix(self.beta[:, None]).multiply(U) + + def _calculate_vel_S(self, vel, U, S, N, T): + return vel.vel_s(U, S) + + def _calculate_vel_N(self, vel, U, S, N, T): + return self.alpha1 - csr_matrix(self.gamma[:, None]).multiply(self.u_new) + + def _calculate_vel_T(self, vel, U, S, N, T): + return self.alpha1 - csr_matrix(self.gamma[:, None]).multiply(T) + + def _calculate_velocity(self, vel, U, S, N, T): + if self.has_splicing: + u0, self.u_new, self.alpha1 = solve_alpha_2p_mat( + t0=np.max(self.t) - self.t, + t1=self.t, + alpha0=self.alpha[0], + beta=self.beta, + u1=N, + ) + vel_U = self._calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) + vel_S = self._calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) + else: + u0, self.u_new, self.alpha1 = solve_alpha_2p_mat( + t0=np.max(self.t) - self.t, + t1=self.t, + alpha0=self.alpha[0], + beta=self.gamma, + u1=N, + ) + vel_U, vel_S = np.nan, np.nan + vel_N = self._calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) + vel_T = self._calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) + return vel_U, vel_S, vel_N, vel_T + + +class MixKineticsDynamics(LabeledDynamics): + def _calculate_vel_U(self, vel, U, S, N, T): + return vel.vel_u(U, repeat=True) + def _calculate_vel_S(self, vel, U, S, N, T): + return vel.vel_s(U, S) + + def _calculate_vel_N(self, vel, U, S, N, T): + return vel.vel_u(N, repeat=True) + + def _calculate_vel_T(self, vel, U, S, N, T): + return vel.vel_u(T) if not self.has_splicing and self.NTR_vel else vel.vel_u(T, repeat=True) + + def _calculate_velocity(self, vel, U, S, N, T): + if self.has_splicing: + vel_U = self._calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) + vel_S = self._calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) + vel.parameters["beta"] = self.gamma + else: + vel_U, vel_S = np.nan, np.nan + vel_N = self._calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) + vel_T = self._calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) + return vel_U, vel_S, vel_N, vel_T + + # TODO: rename this later def 
dynamics_wrapper( adata: AnnData, From 762039a3aebee0ad64dfc41bac81b38e7abe7e41 Mon Sep 17 00:00:00 2001 From: sichao Date: Tue, 13 Jun 2023 16:27:28 -0400 Subject: [PATCH 10/31] finalize wrapper --- dynamo/tools/dynamics.py | 205 +++++++++++++++++++-------------------- dynamo/tools/utils.py | 66 +++++++++++++ 2 files changed, 166 insertions(+), 105 deletions(-) diff --git a/dynamo/tools/dynamics.py b/dynamo/tools/dynamics.py index 1e4c62750..9d82129d3 100755 --- a/dynamo/tools/dynamics.py +++ b/dynamo/tools/dynamics.py @@ -38,6 +38,7 @@ prepare_data_no_splicing, ) from .utils import ( + get_auto_assump_mRNA, get_data_for_kin_params_estimation, get_U_S_for_velocity_estimation, get_valid_bools, @@ -53,69 +54,41 @@ class BaseDynamics: - def __init__( - self, - adata: AnnData, - filter_gene_mode: Literal["final", "basic", "no"] = "final", - use_smoothed: bool = True, - assumption_mRNA: Literal["ss", "kinetic", "auto"] = "auto", - assumption_protein: Literal["ss"] = "ss", - model: Literal["auto", "deterministic", "stochastic"] = "auto", - model_was_auto: bool = True, - experiment_type: str = None, - has_splicing: bool = True, - has_labeling: bool = False, - splicing_labeling: bool = False, - has_protein: bool = False, - est_method: Literal["ols", "rlm", "ransac", "gmm", "negbin", "auto", "twostep", "direct"] = "auto", - NTR_vel: bool = False, - group: Optional[str] = None, - protein_names: Optional[List[str]] = None, - concat_data: bool = False, - log_unnormalized: bool = True, - one_shot_method: Literal["combined", "sci-fate", "sci_fate"] = "combined", - fraction_for_deg: bool = False, - re_smooth: bool = False, - sanity_check: bool = False, - del_2nd_moments: Optional[bool] = None, - cores: int = 1, - tkey: str = None, - **est_kwargs, - ): - self.adata = adata - self.filter_gene_mode = filter_gene_mode - self.use_smoothed = use_smoothed - self.assumption_mRNA = assumption_mRNA - self.assumption_protein = assumption_protein - self.model = model - self.model_was_auto = model_was_auto - self.experiment_type = experiment_type - self.has_splicing = has_splicing - self.has_labeling = has_labeling - self.splicing_labeling = splicing_labeling - self.has_protein = has_protein - self.est_method = est_method - self.NTR_vel = NTR_vel - self.group = group - self.protein_names = protein_names - self.concat_data = concat_data - self.log_unnormalized = log_unnormalized - self.one_shot_method = one_shot_method - self.fraction_for_deg = fraction_for_deg - self.re_smooth = re_smooth - self.sanity_check = sanity_check + def __init__(self, dynamics_kwargs): + self.adata = dynamics_kwargs["adata"] + self.filter_gene_mode = dynamics_kwargs["filter_gene_mode"] + self.use_smoothed = dynamics_kwargs["use_smoothed"] + self.assumption_mRNA = dynamics_kwargs["assumption_mRNA"] + self.assumption_protein = dynamics_kwargs["assumption_protein"] + self.model = dynamics_kwargs["model"] + self.model_was_auto = dynamics_kwargs["model_was_auto"] + self.experiment_type = dynamics_kwargs["experiment_type"] + self.has_splicing = dynamics_kwargs["has_splicing"] + self.has_labeling = dynamics_kwargs["has_labeling"] + self.splicing_labeling = dynamics_kwargs["splicing_labeling"] + self.has_protein = dynamics_kwargs["has_protein"] + self.est_method = dynamics_kwargs["est_method"] + self.NTR_vel = dynamics_kwargs["NTR_vel"] + self.group = dynamics_kwargs["group"] + self.protein_names = dynamics_kwargs["protein_names"] + self.concat_data = dynamics_kwargs["concat_data"] + self.log_unnormalized = dynamics_kwargs["log_unnormalized"] + 
self.one_shot_method = dynamics_kwargs["one_shot_method"] + self.fraction_for_deg = dynamics_kwargs["fraction_for_deg"] + self.re_smooth = dynamics_kwargs["re_smooth"] + self.sanity_check = dynamics_kwargs["sanity_check"] self.del_2nd_moments = DynamoAdataConfig.use_default_var_if_none( - del_2nd_moments, DynamoAdataConfig.DYNAMICS_DEL_2ND_MOMENTS_KEY + dynamics_kwargs["del_2nd_moments"], DynamoAdataConfig.DYNAMICS_DEL_2ND_MOMENTS_KEY ) - self.cores = cores - if tkey is not None: - if adata.obs[tkey].max() > 60: + self.cores = dynamics_kwargs["cores"] + if dynamics_kwargs["tkey"] is not None: + if dynamics_kwargs["adata"].obs[dynamics_kwargs["tkey"]].max() > 60: main_warning( "Looks like you are using minutes as the time unit. For the purpose of numeric stability, " "we recommend using hour as the time unit." ) - self.tkey = adata.uns["pp"]["tkey"] if tkey is None else tkey - self.est_kwargs = est_kwargs + self.tkey = self.adata.uns["pp"]["tkey"] if dynamics_kwargs["tkey"] is None else dynamics_kwargs["tkey"] + self.est_kwargs = dynamics_kwargs["est_kwargs"] def _estimate_params_ss(self, subset_adata, **est_params_args): if self.est_method.lower() == "auto": @@ -532,6 +505,7 @@ def _calculate_velocity(self, vel, U, S, N, T): vel_S = vel.vel_s(U, S) vel_N = np.nan vel_T = np.nan + return vel_U, vel_S, vel_N, vel_T class LabeledDynamics(BaseDynamics): @@ -639,7 +613,7 @@ def _calculate_vel_T(self, vel, U, S, N, T): return np.nan -class MixStdSdmDynamics(LabeledDynamics): +class MixStdStmDynamics(LabeledDynamics): def _calculate_vel_U(self, vel, U, S, N, T): return self.alpha1 - csr_matrix(self.beta[:, None]).multiply(U) @@ -651,7 +625,7 @@ def _calculate_vel_N(self, vel, U, S, N, T): def _calculate_vel_T(self, vel, U, S, N, T): return self.alpha1 - csr_matrix(self.gamma[:, None]).multiply(T) - + def _calculate_velocity(self, vel, U, S, N, T): if self.has_splicing: u0, self.u_new, self.alpha1 = solve_alpha_2p_mat( @@ -680,6 +654,7 @@ def _calculate_velocity(self, vel, U, S, N, T): class MixKineticsDynamics(LabeledDynamics): def _calculate_vel_U(self, vel, U, S, N, T): return vel.vel_u(U, repeat=True) + def _calculate_vel_S(self, vel, U, S, N, T): return vel.vel_s(U, S) @@ -704,9 +679,25 @@ def _calculate_velocity(self, vel, U, S, N, T): # TODO: rename this later def dynamics_wrapper( adata: AnnData, + filter_gene_mode: Literal["final", "basic", "no"] = "final", + use_smoothed: bool = True, assumption_mRNA: Literal["ss", "kinetic", "auto"] = "auto", + assumption_protein: Literal["ss"] = "ss", model: Literal["auto", "deterministic", "stochastic"] = "auto", - **kwargs, + est_method: Literal["ols", "rlm", "ransac", "gmm", "negbin", "auto", "twostep", "direct"] = "auto", + NTR_vel: bool = False, + group: Optional[str] = None, + protein_names: Optional[List[str]] = None, + concat_data: bool = False, + log_unnormalized: bool = True, + one_shot_method: Literal["combined", "sci-fate", "sci_fate"] = "combined", + fraction_for_deg: bool = False, + re_smooth: bool = False, + sanity_check: bool = False, + del_2nd_moments: Optional[bool] = None, + cores: int = 1, + tkey: str = None, + **est_kwargs, ) -> AnnData: """Run corresponding Dynamics methods according to the parameters.""" if "pp" not in adata.uns_keys(): @@ -726,29 +717,13 @@ def dynamics_wrapper( adata.uns["pp"]["has_protein"], ) - ( - _, - _, - _, - _, - _, - _, - _, - _, - _, - _, - _, - assump_mRNA, - ) = get_data_for_kin_params_estimation( - adata, - has_splicing, - has_labeling, - model, - kwargs["use_smoothed"], - kwargs["tkey"], - 
kwargs["protein_names"], - kwargs["log_unnormalized"], - kwargs["NTR_vel"], + (NTR_vel, assump_mRNA) = get_auto_assump_mRNA( + subset_adata=adata, + has_splicing=has_splicing, + has_labeling=has_labeling, + use_moments=use_smoothed, + tkey=tkey, + NTR_vel=NTR_vel, ) if assumption_mRNA.lower() == "auto": assumption_mRNA = assump_mRNA @@ -779,30 +754,50 @@ def dynamics_wrapper( ]: model = "deterministic" + dynamics_kwargs = { + "adata": adata, + "filter_gene_mode": filter_gene_mode, + "use_smoothed": use_smoothed, + "assumption_mRNA": assumption_mRNA, + "assumption_protein": assumption_protein, + "model": model, + "model_was_auto": model_was_auto, + "experiment_type": experiment_type, + "has_splicing": has_splicing, + "has_labeling": has_labeling, + "splicing_labeling": splicing_labeling, + "has_protein": has_protein, + "est_method": est_method, + "NTR_vel": NTR_vel, + "group": group, + "protein_names": protein_names, + "concat_data": concat_data, + "log_unnormalized": log_unnormalized, + "one_shot_method": one_shot_method, + "fraction_for_deg": fraction_for_deg, + "re_smooth": re_smooth, + "sanity_check": sanity_check, + "del_2nd_moments": del_2nd_moments, + "cores": cores, + "tkey": tkey, + "est_kwargs": est_kwargs, + } + if experiment_type == "conventional": - estimator = SplicedDynamics( - adata=adata, - assumption_mRNA=assumption_mRNA, - has_splicing=has_splicing, - has_labeling=has_labeling, - splicing_labeling=splicing_labeling, - has_protein=has_protein, - model=model, - model_was_auto=model_was_auto, - **kwargs, - ) + estimator = SplicedDynamics(dynamics_kwargs) elif experiment_type in ["one-shot", "one_shot"]: - estimator = SplicedDynamics( - adata=adata, - assumption_mRNA=assumption_mRNA, - has_splicing=has_splicing, - has_labeling=has_labeling, - splicing_labeling=splicing_labeling, - has_protein=has_protein, - model=model, - model_was_auto=model_was_auto, - **kwargs, - ) + estimator = SplicedDynamics(dynamics_kwargs) + elif experiment_type == "kin": + if assumption_mRNA == "ss": + estimator = SSKineticsDynamics(dynamics_kwargs) + elif assumption_mRNA == "kin": + estimator = KineticsDynamics(dynamics_kwargs) + elif experiment_type == "deg": + estimator = DegradationDynamics(dynamics_kwargs) + elif experiment_type == "mix_std_stm": + estimator = MixStdStmDynamics(dynamics_kwargs) + elif experiment_type in ["mix_kin_deg", "mix_pulse_chase"]: + estimator = MixKineticsDynamics(dynamics_kwargs) else: raise NotImplementedError("This method has not been implemented.") estimator.estimate() diff --git a/dynamo/tools/utils.py b/dynamo/tools/utils.py index 6c84d8247..a5b1e6707 100755 --- a/dynamo/tools/utils.py +++ b/dynamo/tools/utils.py @@ -1126,6 +1126,72 @@ def log_unnormalized_data( return raw +def get_auto_assump_mRNA( + subset_adata, + has_splicing, + has_labeling, + use_moments, + tkey, + NTR_vel, +): + if not NTR_vel: + if has_labeling and not has_splicing: + main_warning( + "Your adata only has labeling data, but `NTR_vel` is set to be " + "`False`. Dynamo will reset it to `True` to enable this analysis." 
+ ) + NTR_vel = True + + normalized, assumption_mRNA = ( + False, + None, + ) + mapper = get_mapper() + + # labeling plus splicing + if np.all(([i in subset_adata.layers.keys() for i in ["X_ul", "X_sl", "X_su"]])) or np.all( + ([mapper[i] in subset_adata.layers.keys() for i in ["X_ul", "X_sl", "X_su"]]) + ): # only uu, ul, su, sl provided + normalized, assumption_mRNA = ( + True, + "ss" if NTR_vel else "kinetic", + ) + + elif np.all(([i in subset_adata.layers.keys() for i in ["uu", "ul", "sl", "su"]])): + normalized, assumption_mRNA = ( + False, + "ss" if NTR_vel else "kinetic", + ) + # labeling without splicing + if not has_splicing and ( + ("X_new" in subset_adata.layers.keys() and not use_moments) + or (mapper["X_new"] in subset_adata.layers.keys() and use_moments) + ): # run new / total ratio (NTR) + normalized, assumption_mRNA = ( + True, + "ss" if NTR_vel else "kinetic", + ) + elif not has_splicing and "new" in subset_adata.layers.keys(): + assumption_mRNA = ("ss" if NTR_vel else "kinetic",) + # splicing data + if not has_labeling and ( + ("X_unspliced" in subset_adata.layers.keys() and not use_moments) + or (mapper["X_unspliced"] in subset_adata.layers.keys() and use_moments) + ): + normalized, assumption_mRNA = ( + True, + "kinetic" if tkey in subset_adata.obs.columns else "ss", + ) + elif not has_labeling and "unspliced" in subset_adata.layers.keys(): + assumption_mRNA = "kinetic" if tkey in subset_adata.obs.columns else "ss" + + if has_labeling: + if assumption_mRNA is None: + assumption_mRNA = "ss" if NTR_vel else "kinetic" + + return NTR_vel, assumption_mRNA + + def get_data_for_kin_params_estimation( subset_adata, has_splicing, From 6090979b00035a269c9d7b6276875ec2db850a38 Mon Sep 17 00:00:00 2001 From: sichao Date: Tue, 13 Jun 2023 17:15:57 -0400 Subject: [PATCH 11/31] debug wrapper and kin --- dynamo/tools/dynamics.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/dynamo/tools/dynamics.py b/dynamo/tools/dynamics.py index 9d82129d3..651816817 100755 --- a/dynamo/tools/dynamics.py +++ b/dynamo/tools/dynamics.py @@ -592,7 +592,7 @@ def _calculate_velocity(self, vel, U, S, N, T): vel.parameters["beta"] = self.gamma else: vel_U, vel_S = np.nan, np.nan - alpha_ = one_shot_alpha_matrix(T, self.gamma, self.t) + alpha_ = one_shot_alpha_matrix(N, self.gamma, self.t) vel.parameters["alpha"] = alpha_ vel_N = self._calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) vel_T = self._calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) @@ -786,12 +786,14 @@ def dynamics_wrapper( if experiment_type == "conventional": estimator = SplicedDynamics(dynamics_kwargs) elif experiment_type in ["one-shot", "one_shot"]: - estimator = SplicedDynamics(dynamics_kwargs) + estimator = OneShotDynamics(dynamics_kwargs) elif experiment_type == "kin": if assumption_mRNA == "ss": estimator = SSKineticsDynamics(dynamics_kwargs) - elif assumption_mRNA == "kin": + elif assumption_mRNA == "kinetic": estimator = KineticsDynamics(dynamics_kwargs) + else: + raise NotImplementedError("This method has not been implemented.") elif experiment_type == "deg": estimator = DegradationDynamics(dynamics_kwargs) elif experiment_type == "mix_std_stm": From 35a3de427d851654fcc6f0c587b214435d60e2c8 Mon Sep 17 00:00:00 2001 From: sichao Date: Wed, 14 Jun 2023 14:28:57 -0400 Subject: [PATCH 12/31] add base docstr and all typing --- dynamo/tools/dynamics.py | 393 ++++++++++++++++++++++++++++++++++----- 1 file changed, 348 insertions(+), 45 deletions(-) diff --git a/dynamo/tools/dynamics.py b/dynamo/tools/dynamics.py index 
651816817..023270a99 100755 --- a/dynamo/tools/dynamics.py +++ b/dynamo/tools/dynamics.py @@ -10,6 +10,7 @@ import numpy as np import pandas as pd from anndata import AnnData +from numpy import ndarray from scipy.sparse import SparseEfficiencyWarning, csr_matrix, issparse from tqdm import tqdm @@ -54,7 +55,9 @@ class BaseDynamics: - def __init__(self, dynamics_kwargs): + """The base class for the inclusive model of expression dynamics considers splicing, metabolic labeling and protein + translation.""" + def __init__(self, dynamics_kwargs: Dict): self.adata = dynamics_kwargs["adata"] self.filter_gene_mode = dynamics_kwargs["filter_gene_mode"] self.use_smoothed = dynamics_kwargs["use_smoothed"] @@ -90,7 +93,8 @@ def __init__(self, dynamics_kwargs): self.tkey = self.adata.uns["pp"]["tkey"] if dynamics_kwargs["tkey"] is None else dynamics_kwargs["tkey"] self.est_kwargs = dynamics_kwargs["est_kwargs"] - def _estimate_params_ss(self, subset_adata, **est_params_args): + def _estimate_params_ss(self, subset_adata: AnnData, **est_params_args): + """Estimate velocity parameters with steady state mRNA assumption.""" if self.est_method.lower() == "auto": self.est_method = "gmm" if self.model.lower() == "stochastic" else "ols" @@ -134,7 +138,8 @@ def _estimate_params_ss(self, subset_adata, **est_params_args): self.alpha, self.beta, self.gamma, self.eta, self.delta = self.est.parameters.values() - def _estimate_params_kin(self, cur_grp_i, cur_grp, subset_adata, **est_params_args): + def _estimate_params_kin(self, cur_grp_i: int, cur_grp: str, subset_adata: AnnData, **est_params_args): + """Estimate velocity parameters with kinetic mRNA assumption.""" return_ntr = True if self.fraction_for_deg and self.experiment_type.lower() == "deg" else False if self.model_was_auto and self.experiment_type.lower() == "kin": @@ -201,7 +206,9 @@ def _estimate_params_kin(self, cur_grp_i, cur_grp, subset_adata, **est_params_ar self.kin_extra_params = params.loc[:, params.columns.difference(all_kinetic_params)] - def estimate_parameters(self, cur_grp_i, cur_grp, subset_adata, **est_params_args): + def estimate_parameters(self, cur_grp_i: int, cur_grp: str, subset_adata: AnnData, **est_params_args): + """Wrapper to call corresponding parameters estimation functions according to assumptions. Override this in the + subclass if the class doesn't use ss_estimation or kinetic_model to estimate.""" if self.assumption_mRNA.lower() == "ss" or (self.experiment_type.lower() in ["one-shot", "mix_std_stm"]): self._estimate_params_ss(subset_adata=subset_adata, **est_params_args) elif self.assumption_mRNA.lower() == "kinetic": @@ -209,7 +216,21 @@ def estimate_parameters(self, cur_grp_i, cur_grp, subset_adata, **est_params_arg else: main_warning("Not implemented yet.") - def set_velocity(self, vel_U, vel_S, vel_N, vel_T, vel_P, cur_grp, cur_cells_bools, valid_bools_, kin_param_pre, **set_velo_args): + def set_velocity( + self, + vel_U: Union[ndarray, csr_matrix], + vel_S: Union[ndarray, csr_matrix], + vel_N: Union[ndarray, csr_matrix], + vel_T: Union[ndarray, csr_matrix], + vel_P: Union[ndarray, csr_matrix], + cur_grp: int, + cur_cells_bools: ndarray, + valid_bools_: ndarray, + kin_param_pre: str, + **set_velo_args, + ): + """Save the calculated parameters and velocity to anndata. 
Override this in the subclass if the class has a + different assumption.""" if self.assumption_mRNA.lower() == "ss" or (self.experiment_type.lower() in ["one-shot", "mix_std_stm"]): self.adata = set_velocity( self.adata, @@ -276,13 +297,41 @@ def set_velocity(self, vel_U, vel_S, vel_N, vel_T, vel_P, cur_grp, cur_cells_boo else: main_warning("Not implemented yet.") - def _calculate_velocity(self, vel, U, S, N, T): + def _calculate_velocity( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Tuple: + """The core function to calculate the RNA velocity. Every subclass needs to implement this function. + + Args: + vel: the Velocity object to calculate the velocity. + U: the matrix representing unspliced layer. + S: the matrix representing spliced layer. + N: the matrix representing new layer in metabolic labeling. + T: the matrix representing total layer in metabolic labeling. + + Returns: + The velocity matrix for unspliced, spliced, new and total layers. + """ raise NotImplementedError("This method has not been implemented.") - def _calculate_vel_P(self, vel, U, S, N, T): + def _calculate_vel_P( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: + """Calculate the protein velocity.""" return vel.vel_p(T, self.P) if self.NTR_vel else vel.vel_p(S, self.P) - def calculate_velocity(self, subset_adata): + def calculate_velocity(self, subset_adata: AnnData) -> Tuple: + """Read the U, S, N, T matrix, create the Velocity class and call the velocity calculation function.""" U, S = get_U_S_for_velocity_estimation( subset_adata, self.use_smoothed, @@ -312,7 +361,8 @@ def calculate_velocity(self, subset_adata): return vel_U, vel_S, vel_N, vel_T, vel_P - def _filter(self): + def _filter(self) -> Tuple: + """Get filter bools based on existing filter in AnnData.""" filter_list, filter_gene_mode_list = ( [ "use_for_pca", @@ -334,7 +384,8 @@ def _filter(self): raise Exception(f"no genes pass filter. Try resetting `filter_gene_mode = 'no'` to use all genes.") return filter_gene_mode, valid_bools, gene_num - def _smooth(self, valid_bools): + def _smooth(self, valid_bools: ndarray): + """Smooth the data by moments when necessary.""" M_layers = [i for i in self.adata.layers.keys() if i.startswith("M_")] if len(M_layers) < 2 or self.re_smooth: @@ -354,7 +405,15 @@ def _smooth(self, valid_bools): f"performed. Try setting re_smooth = True if not sure." ) - def _sanity_check(self, valid_bools, valid_bools_, gene_num, subset_adata, kin_param_pre): + def _sanity_check( + self, + valid_bools: ndarray, + valid_bools_: ndarray, + gene_num: int, + subset_adata: AnnData, + kin_param_pre: str, + ) -> Tuple: + """Perform sanity check by checking the slope for kinetic or degradation metabolic labeling experiments.""" indices_valid_bools = np.where(valid_bools)[0] self.t, L = ( self.t.flatten(), @@ -395,6 +454,12 @@ def _sanity_check(self, valid_bools, valid_bools_, gene_num, subset_adata, kin_p return subset_adata, valid_bools_ def estimate(self): + """Main function to estimate the RNA dynamics. + + The function initially conducts filtering, smoothing, and sanity checks to ensure data quality. Subsequently, it + calls the corresponding functions to estimate parameters and compute velocity. Lastly, it updates the AnnData + object and save all results. 
+ """ self.X_data, self.X_fit_data = None, None filter_gene_mode, valid_bools, gene_num = self._filter() @@ -500,7 +565,14 @@ def estimate(self): class SplicedDynamics(BaseDynamics): - def _calculate_velocity(self, vel, U, S, N, T): + def _calculate_velocity( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Tuple: vel_U = vel.vel_u(U) vel_S = vel.vel_s(U, S) vel_N = np.nan @@ -509,19 +581,54 @@ def _calculate_velocity(self, vel, U, S, N, T): class LabeledDynamics(BaseDynamics): - def _calculate_vel_U(self, vel, U, S, N, T): + def _calculate_vel_U( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: raise NotImplementedError("This method has not been implemented.") - def _calculate_vel_S(self, vel, U, S, N, T): + def _calculate_vel_S( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: raise NotImplementedError("This method has not been implemented.") - def _calculate_vel_N(self, vel, U, S, N, T): + def _calculate_vel_N( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: raise NotImplementedError("This method has not been implemented.") - def _calculate_vel_T(self, vel, U, S, N, T): + def _calculate_vel_T( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: raise NotImplementedError("This method has not been implemented.") - def _calculate_velocity(self, vel, U, S, N, T): + def _calculate_velocity( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Tuple: if self.has_splicing: vel_U = self._calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) vel_S = self._calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) @@ -533,33 +640,96 @@ def _calculate_velocity(self, vel, U, S, N, T): class OneShotDynamics(LabeledDynamics): - def _calculate_vel_U(self, vel, U, S, N, T): + def _calculate_vel_U( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: return vel.vel_u(U) - def _calculate_vel_S(self, vel, U, S, N, T): + def _calculate_vel_S( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: return vel.vel_s(U, S) - def _calculate_vel_N(self, vel, U, S, N, T): + def _calculate_vel_N( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: return vel.vel_u(N) - def _calculate_vel_T(self, vel, U, S, N, T): + def _calculate_vel_T( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: return vel.vel_s(N, T - 
N) if self.has_splicing else vel.vel_u(T) class SSKineticsDynamics(LabeledDynamics): - def _calculate_vel_U(self, vel, U, S, N, T): + def _calculate_vel_U( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: return N.multiply(csr_matrix(self.gamma_ / self.Kc)) - csr_matrix(self.beta).multiply(U) - def _calculate_vel_S(self, vel, U, S, N, T): + def _calculate_vel_S( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: return vel.vel_s(U, S) - def _calculate_vel_N(self, vel, U, S, N, T): + def _calculate_vel_N( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: return (N - csr_matrix(self.Kc).multiply(N)).multiply(csr_matrix(self.gamma_ / self.Kc)) - def _calculate_vel_T(self, vel, U, S, N, T): + def _calculate_vel_T( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: return (N - csr_matrix(self.Kc).multiply(T)).multiply(csr_matrix(self.gamma_ / self.Kc)) - def _calculate_velocity(self, vel, U, S, N, T): + def _calculate_velocity( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Tuple: self.Kc = np.clip(self.gamma[:, None], 0, 1 - 1e-3) # S - U slope self.gamma_ = -(np.log(1 - self.Kc) / self.t[None, :]) # actual gamma if self.has_splicing: @@ -573,19 +743,54 @@ def _calculate_velocity(self, vel, U, S, N, T): class KineticsDynamics(LabeledDynamics): - def _calculate_vel_U(self, vel, U, S, N, T): + def _calculate_vel_U( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: return vel.vel_u(U) - def _calculate_vel_S(self, vel, U, S, N, T): + def _calculate_vel_S( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: return vel.vel_s(U, S) - def _calculate_vel_N(self, vel, U, S, N, T): + def _calculate_vel_N( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: return vel.vel_u(N) - def _calculate_vel_T(self, vel, U, S, N, T): + def _calculate_vel_T( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: return vel.vel_u(T) - def _calculate_velocity(self, vel, U, S, N, T): + def _calculate_velocity( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Tuple: if self.has_splicing: vel_U = self._calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) vel_S = self._calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) @@ -600,33 +805,96 @@ def _calculate_velocity(self, vel, U, S, N, T): class 
DegradationDynamics(LabeledDynamics): - def _calculate_vel_U(self, vel, U, S, N, T): + def _calculate_vel_U( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: return np.nan - def _calculate_vel_S(self, vel, U, S, N, T): + def _calculate_vel_S( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: return vel.vel_s(U, S) - def _calculate_vel_N(self, vel, U, S, N, T): + def _calculate_vel_N( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: return np.nan - def _calculate_vel_T(self, vel, U, S, N, T): + def _calculate_vel_T( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: return np.nan class MixStdStmDynamics(LabeledDynamics): - def _calculate_vel_U(self, vel, U, S, N, T): + def _calculate_vel_U( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: return self.alpha1 - csr_matrix(self.beta[:, None]).multiply(U) - def _calculate_vel_S(self, vel, U, S, N, T): + def _calculate_vel_S( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: return vel.vel_s(U, S) - def _calculate_vel_N(self, vel, U, S, N, T): + def _calculate_vel_N( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: return self.alpha1 - csr_matrix(self.gamma[:, None]).multiply(self.u_new) - def _calculate_vel_T(self, vel, U, S, N, T): + def _calculate_vel_T( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: return self.alpha1 - csr_matrix(self.gamma[:, None]).multiply(T) - def _calculate_velocity(self, vel, U, S, N, T): + def _calculate_velocity( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Tuple: if self.has_splicing: u0, self.u_new, self.alpha1 = solve_alpha_2p_mat( t0=np.max(self.t) - self.t, @@ -652,19 +920,54 @@ def _calculate_velocity(self, vel, U, S, N, T): class MixKineticsDynamics(LabeledDynamics): - def _calculate_vel_U(self, vel, U, S, N, T): + def _calculate_vel_U( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: return vel.vel_u(U, repeat=True) - def _calculate_vel_S(self, vel, U, S, N, T): + def _calculate_vel_S( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: return vel.vel_s(U, S) - def _calculate_vel_N(self, vel, 
U, S, N, T): + def _calculate_vel_N( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: return vel.vel_u(N, repeat=True) - def _calculate_vel_T(self, vel, U, S, N, T): + def _calculate_vel_T( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: return vel.vel_u(T) if not self.has_splicing and self.NTR_vel else vel.vel_u(T, repeat=True) - def _calculate_velocity(self, vel, U, S, N, T): + def _calculate_velocity( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Tuple: if self.has_splicing: vel_U = self._calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) vel_S = self._calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) From c0e2c94d31ab75267be5541df0081178ee846c7f Mon Sep 17 00:00:00 2001 From: sichao Date: Wed, 14 Jun 2023 16:34:50 -0400 Subject: [PATCH 13/31] add docstr for subclass --- dynamo/tools/dynamics.py | 238 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 236 insertions(+), 2 deletions(-) diff --git a/dynamo/tools/dynamics.py b/dynamo/tools/dynamics.py index 023270a99..2b791aef8 100755 --- a/dynamo/tools/dynamics.py +++ b/dynamo/tools/dynamics.py @@ -56,7 +56,216 @@ class BaseDynamics: """The base class for the inclusive model of expression dynamics considers splicing, metabolic labeling and protein - translation.""" + translation. + + The function supports learning high-dimensional velocity vector samples for droplet based (10x, inDrop, drop-seq, + etc), scSLAM-seq, NASC-seq sci-fate, scNT-seq, scEU-seq, cite-seq or REAP-seq datasets. + + Args: + adata: an AnnData object. + filter_gene_mode: The string for indicating which mode of gene filter will be used. Defaults to "final". + use_smoothed: whether to use the smoothed data when estimating kinetic parameters and calculating velocity for + each gene. When you have time-series data (`tkey` is not None), we recommend to smooth data among cells from + each time point. Defaults to True. + assumption_mRNA: Parameter estimation assumption for mRNA. Available options are: + (1) 'ss': pseudo steady state; + (2) 'kinetic' or None: degradation and kinetic data without steady state assumption. + (3) 'auto': dynamo will choose a reasonable assumption of the system under study automatically. + If no labelling data exists, assumption_mRNA will automatically set to be 'ss'. For one-shot experiment, + assumption_mRNA is set to be None. However we will use steady state assumption to estimate parameters alpha + and gamma either by a deterministic linear regression or the first order decay approach in line of the + sci-fate paper; + Defaults to "auto". + assumption_protein: Parameter estimation assumption for protein. Available options are: + (1) 'ss': pseudo steady state; + Defaults to "ss". + model: String indicates which estimation model will be used. + Available options are: + (1) 'deterministic': The method based on `deterministic` ordinary differential equations; + (2) 'stochastic' or `moment`: The new method from us that is based on `stochastic` master equations; + Note that `kinetic` model doesn't need to assumes the `experiment_type` is not `conventional`. 
As other + labeling experiments, if you specify the `tkey`, dynamo can also apply `kinetic` model on `conventional` + scRNA-seq datasets. A "model_selection" model will be supported soon in which alpha, beta and gamma will be + modeled as a function of time. + Defaults to "auto". + est_method: This parameter should be used in conjunction with `model` parameter. + Available options when the `model` is 'ss' include: + (1) 'ols': The canonical method or Ordinary Least Squares regression from the seminar RNA velocity paper + based on deterministic ordinary differential equations; + (2) 'rlm': The robust linear models from statsmodels. Robust Regression provides an alternative to OLS + regression by lowering the restrictions on assumptions and dampens the effect of outliers in order + to fit majority of the data. + (3) 'ransac': RANSAC (RANdom SAmple Consensus) algorithm for robust linear regression. RANSAC is an + iterative algorithm for the robust estimation of parameters from a subset of inliers from the + complete dataset. RANSAC implementation is based on RANSACRegressor function from sklearn package. + Note that if `rlm` or `ransac` failed, it will roll back to the `ols` method. In addition, `ols`, + `rlm` and `ransac` can be only used in conjunction with the `deterministic` model. + (4) 'gmm': The new generalized methods of moments from us that is based on master equations, similar to + the "moment" model in the excellent scVelo package; + (5) 'negbin': The new method from us that models steady state RNA expression as a negative binomial + distribution, also built upon on master equations. + (6) 'auto': dynamo will choose the suitable estimation method based on the `assumption_mRNA`, + `experiment_type` and `model` parameter. + Note that all those methods require using extreme data points (except negbin, which use all data points) for + estimation. Extreme data points are defined as the data from cells whose expression of unspliced / spliced + or new / total RNA, etc. are in the top or bottom, 5%, for example. `linear_regression` only considers the + mean of RNA species (based on the `deterministic` ordinary different equations) while moment based methods + (`gmm`, `negbin`) considers both first moment (mean) and second moment (uncentered variance) of RNA species + (based on the `stochastic` master equations). + The above method are all (generalized) linear regression based method. In order to return estimated + parameters (including RNA half-life), it additionally returns R-squared (either just for extreme data points + or all data points) as well as the log-likelihood of the fitting, which will be used for transition matrix + and velocity embedding. + Available options when the `assumption_mRNA` is 'kinetic' include: + (1) 'auto': dynamo will choose the suitable estimation method based on the `assumption_mRNA`, + `experiment_type` and `model` parameter. + (2) `twostep`: first for each time point, estimate K (1-e^{-rt}) using the total and new RNA data. Then + use regression via t-np.log(1-K) to get degradation rate gamma. When splicing and labeling data both + exist, replacing new/total with ul/u can be used to estimate beta. Suitable for velocity estimation. + (3) `direct` (default): method that directly uses the kinetic model to estimate rate parameters, + generally not good for velocity estimation. + Under `kinetic` model, choosing estimation is `experiment_type` dependent. For `kinetics` experiments, + dynamo supposes methods including RNA bursting or without RNA bursting. 
Dynamo also adaptively estimates + parameters, based on whether the data has splicing or without splicing. + Under `kinetic` assumption, the above method uses non-linear least square fitting. In order to return + estimated parameters (including RNA half-life), it additionally returns the log-likelihood of the + fitting, which will be used for transition matrix and velocity embedding. + All `est_method` uses least square to estimate optimal parameters with latin cubic sampler for initial + sampling. Defaults to "auto". + NTR_vel: whether to use NTR (new/total ratio) velocity for labeling datasets. Defaults to False. + group: the column key/name that identifies the grouping information (for example, clusters that correspond to + different cell types) of cells. This will be used to calculate 1/2 st moments and covariance for each cells + in each group. It will also enable estimating group-specific (i.e cell-type specific) kinetic parameters. + Defaults to None. + protein_names: a list of gene names corresponds to the rows of the measured proteins in the `X_protein` of the + `obsm` attribute. The names have to be included in the adata.var.index. Defaults to None. + concat_data: whether to concatenate data before estimation. If your data is a list of matrices for each time + point, this need to be set as True. Defaults to False. + log_unnormalized: whether to log transform the unnormalized data. Defaults to True. + one_shot_method: The method that will be used for estimating kinetic parameters for one-shot experiment data. + (1) the "sci-fate" method directly solves gamma with the first-order decay model; + (2) the "combined" model uses the linear regression under steady state to estimate relative gamma, and then + calculate absolute gamma (degradation rate), beta (splicing rate) and cell-wise alpha (transcription + rate). Defaults to "combined". + fraction_for_deg: whether to use the fraction of labeled RNA instead of the raw labeled RNA to estimate the + degradation parameter. Defaults to False. + re_smooth: whether to re-smooth the adata and also recalculate 1/2 moments or covariance. Defaults to False. + sanity_check: whether to perform sanity-check before estimating kinetic parameters and velocity vectors, + currently only applicable to kinetic or degradation metabolic labeling based scRNA-seq data. The basic idea + is that for kinetic (degradation) experiment, the total labelled RNA for each gene should increase + (decrease) over time. If they don't satisfy this criteria, those genes will be ignored during the + estimation. Defaults to False. + del_2nd_moments: whether to remove second moments or covariances. Default it is `False` so this avoids + recalculating 2nd moments or covariance but it may take a lot memory when your dataset is big. Set this to + `True` when your data is huge (like > 25, 000 cells or so) to reducing the memory footprint. Defaults to + None. + cores: number of cores to run the estimation. If cores is set to be > 1, multiprocessing will be used to + parallel the parameter estimation. Currently only applicable cases when assumption_mRNA is `ss` or cases + when experiment_type is either "one-shot" or "mix_std_stm". Defaults to 1. + tkey: the column key for the labeling time of cells in .obs. Used for labeling based scRNA-seq data. If `tkey` + is None, then `adata.uns["pp"]["tkey"]` will be checked and used if exists. Defaults to None. + **est_kwargs: Other arguments passed to the fit method (steady state models) or estimation methods (kinetic + models). 
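A minimal sketch of the `twostep` idea described under `est_method`, for a single gene with dense arrays (toy variable names; the actual two-step estimators imported from ..estimation.tsc.twostep work on moments and may weight points differently):

    import numpy as np

    t = np.array([0.5, 0.5, 1.0, 1.0, 2.0, 2.0])          # labeling time of each cell
    new = np.array([2.0, 3.0, 5.0, 4.0, 7.0, 8.0])        # labeled counts of one gene
    total = np.array([10.0, 11.0, 10.0, 9.0, 10.0, 11.0])  # total counts of the same gene

    # step 1: per time point, K = new / total approximates 1 - exp(-gamma * t)
    t_uniq = np.unique(t)
    K = np.array([new[t == ti].mean() / total[t == ti].mean() for ti in t_uniq])

    # step 2: regressing -log(1 - K) on t through the origin yields the degradation rate gamma
    y = -np.log(1 - K)
    gamma = (t_uniq @ y) / (t_uniq @ t_uniq)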
+ + Raises: + ValueError: preprocessing not performed. + Exception: No gene pass filter. + Exception: Too few valid genes. + + Returns: + An updated AnnData object with estimated kinetic parameters, inferred velocity and estimation related + information included. The estimated kinetic parameters are currently appended to .obs (should move to .obsm with + the key `dynamics` later). Depends on the estimation method, experiment type and whether you applied estimation + for each groups via `group`, the number of returned parameters can be variable. For conventional scRNA-seq + (including cite-seq or other types of protein/RNA coassays) and somethings metabolic labeling data, the + parameters will at mostly include: + alpha: Transcription rate + beta: Splicing rate + gamma: Spliced RNA degradation rate + eta: Translation rate (only applicable to RNA/protein coassay) + delta: Protein degradation rate (only applicable to RNA/protein coassay) + alpha_b: intercept of alpha fit + beta_b: intercept of beta fit + gamma_b: intercept of gamma fit + eta_b: intercept of eta fit (only applicable to RNA/protein coassay) + delta_b: intercept of delta fit (only applicable to RNA/protein coassay) + alpha_r2: r-squared for goodness of fit of alpha estimation + beta_r2: r-squared for goodness of fit of beta estimation + gamma_r2: r-squared for goodness of fit of gamma estimation + eta_r2: r-squared for goodness of fit of eta estimation (only applicable to RNA/protein coassay) + delta_r2: r-squared for goodness of fit of delta estimation (only applicable to RNA/protein coassay) + alpha_logLL: loglikelihood of alpha estimation (only applicable to stochastic model) + beta_loggLL: loglikelihood of beta estimation (only applicable to stochastic model) + gamma_logLL: loglikelihood of gamma estimation (only applicable to stochastic model) + eta_logLL: loglikelihood of eta estimation (only applicable to stochastic model and RNA/protein coassay) + delta_loggLL: loglikelihood of delta estimation (only applicable to stochastic model and RNA/protein + coassay) + uu0: estimated amount of unspliced unlabeled RNA at time 0 (only applicable to data with both splicing + and labeling) + ul0: estimated amount of unspliced labeled RNA at time 0 (only applicable to data with both splicing + and labeling) + su0: estimated amount of spliced unlabeled RNA at time 0 (only applicable to data with both splicing + and labeling) + sl0: estimated amount of spliced labeled RNA at time 0 (only applicable to data with both splicing and + labeling) + U0: estimated amount of unspliced RNA (uu + ul) at time 0 + S0: estimated amount of spliced (su + sl) RNA at time 0 + total0: estimated amount of spliced (U + S) RNA at time 0 + half_life: Spliced mRNA's half-life (log(2) / gamma) + + Note that all data points are used when estimating r2 although only extreme data points are used for + estimating r2. This is applicable to all estimation methods, either `linear_regression`, `gmm` or `negbin`. + By default we set the intercept to be 0. 
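For example, gamma = 0.2 per hour gives half_life = np.log(2) / gamma ≈ 3.47 hours, i.e. the time for the spliced pool to decay to half its level once transcription stops.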
+ + For metabolic labeling data, the kinetic parameters will at most include: + alpha: Transcription rate (effective - when RNA promoter switching considered) + beta: Splicing rate + gamma: Spliced RNA degradation rate + a: Switching rate from active promoter state to inactive promoter state + b: Switching rate from inactive promoter state to active promoter state + alpha_a: Transcription rate for active promoter + alpha_i: Transcription rate for inactive promoter + cost: cost of the kinetic parameters estimation + logLL: loglikelihood of kinetic parameters estimation + alpha_r2: r-squared for goodness of fit of alpha estimation + beta_r2: r-squared for goodness of fit of beta estimation + gamma_r2: r-squared for goodness of fit of gamma estimation + uu0: estimated amount of unspliced unlabeled RNA at time 0 (only applicable to data with both splicing + and labeling) + ul0: estimated amount of unspliced labeled RNA at time 0 (only applicable to data with both splicing + and labeling) + su0: estimated amount of spliced unlabeled RNA at time 0 (only applicable to data with both splicing + and labeling) + sl0: estimated amount of spliced labeled RNA at time 0 (only applicable to data with both splicing and + labeling) + u0: estimated amount of unspliced RNA (including uu, ul) at time 0 + s0: estimated amount of spliced (including su, sl) RNA at time 0 + total0: estimated amount of spliced (including U, S) RNA at time 0 + p_half_life: half-life for unspliced mRNA + half_life: half-life for spliced mRNA + + If sanity_check has performed, a column with key `sanity_check` will also included which indicates which + gene passes filter (`filter_gene_mode`) and sanity check. This is only applicable to kinetic and degradation + metabolic labeling experiments. + + In addition, the `dynamics` key of the .uns attribute corresponds to a dictionary that includes the + following keys: + t: An array like object that indicates the time point of each cell used during parameters estimation + (applicable only to kinetic models) + group: The group that you used to estimate parameters group-wise + X_data: The input that was used for estimating parameters (applicable only to kinetic models) + X_fit_data: The data that was fitted during parameters estimation (applicable only to kinetic models) + asspt_mRNA: Assumption of mRNA dynamics (steady state or kinetic) + experiment_type: Experiment type (either conventional or metabolic labeling based) + normalized: Whether to normalize data + model: Model used for the parameter estimation (either auto, deterministic or stochastic) + has_splicing: Does the adata has splicing? detected automatically + has_labeling: Does the adata has labelling? detected automatically + has_protein: Does the adata has protein information? detected automatically + use_smoothed: Whether to use smoothed data (or first moment, done via local average of neighbor cells) + NTR_vel: Whether to estimate NTR velocity + log_unnormalized: Whether to log transform unnormalized data. + """ def __init__(self, dynamics_kwargs: Dict): self.adata = dynamics_kwargs["adata"] self.filter_gene_mode = dynamics_kwargs["filter_gene_mode"] @@ -565,6 +774,8 @@ def estimate(self): class SplicedDynamics(BaseDynamics): + """Dynamics models for RNA data only contain spliced RNA. 
This includes the conventional, generalized moments method + (GMM) and negative binomial (NB) distribution method.""" def _calculate_velocity( self, vel: Velocity, @@ -573,6 +784,7 @@ def _calculate_velocity( N: Union[ndarray, csr_matrix], T: Union[ndarray, csr_matrix], ) -> Tuple: + """Implement the velocity calculation function for splicing data. Calculate unspliced and spliced velocity.""" vel_U = vel.vel_u(U) vel_S = vel.vel_s(U, S) vel_N = np.nan @@ -581,6 +793,7 @@ def _calculate_velocity( class LabeledDynamics(BaseDynamics): + """Dynamics model for metabolic labeling data.""" def _calculate_vel_U( self, vel: Velocity, @@ -589,6 +802,7 @@ def _calculate_vel_U( N: Union[ndarray, csr_matrix], T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: + """Calculate unspliced velocity. All subclass should implement this method.""" raise NotImplementedError("This method has not been implemented.") def _calculate_vel_S( @@ -599,6 +813,7 @@ def _calculate_vel_S( N: Union[ndarray, csr_matrix], T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: + """Calculate spliced velocity. All subclass should implement this method.""" raise NotImplementedError("This method has not been implemented.") def _calculate_vel_N( @@ -609,6 +824,7 @@ def _calculate_vel_N( N: Union[ndarray, csr_matrix], T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: + """Calculate new velocity. All subclass should implement this method.""" raise NotImplementedError("This method has not been implemented.") def _calculate_vel_T( @@ -619,6 +835,7 @@ def _calculate_vel_T( N: Union[ndarray, csr_matrix], T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: + """Calculate total velocity. All subclass should implement this method.""" raise NotImplementedError("This method has not been implemented.") def _calculate_velocity( @@ -629,6 +846,8 @@ def _calculate_velocity( N: Union[ndarray, csr_matrix], T: Union[ndarray, csr_matrix], ) -> Tuple: + """Implement the velocity calculation function for metabolic labeling data. Unsplcied and spliced velocity will + be nan for data without splicing information.""" if self.has_splicing: vel_U = self._calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) vel_S = self._calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) @@ -640,6 +859,7 @@ def _calculate_velocity( class OneShotDynamics(LabeledDynamics): + """Dynamics model for the one shot experiment, where there is only one labeling time point.""" def _calculate_vel_U( self, vel: Velocity, @@ -682,6 +902,8 @@ def _calculate_vel_T( class SSKineticsDynamics(LabeledDynamics): + """Two-step dynamics model for the Kinetic experiment with steady state assumption, which relies on two consecutive + linear regressions to estimate the degradation rate.""" def _calculate_vel_U( self, vel: Velocity, @@ -730,6 +952,7 @@ def _calculate_velocity( N: Union[ndarray, csr_matrix], T: Union[ndarray, csr_matrix], ) -> Tuple: + """Override the velocity calculation function to calculate extra parameters slope and actual gamma.""" self.Kc = np.clip(self.gamma[:, None], 0, 1 - 1e-3) # S - U slope self.gamma_ = -(np.log(1 - self.Kc) / self.t[None, :]) # actual gamma if self.has_splicing: @@ -743,6 +966,8 @@ def _calculate_velocity( class KineticsDynamics(LabeledDynamics): + """Dynamic models for the kinetic experiment with kinetic assumption. 
This includes a kinetic two-step method and + the direct method.""" def _calculate_vel_U( self, vel: Velocity, @@ -791,6 +1016,7 @@ def _calculate_velocity( N: Union[ndarray, csr_matrix], T: Union[ndarray, csr_matrix], ) -> Tuple: + """Override the velocity calculation function to reset beta or alpha.""" if self.has_splicing: vel_U = self._calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) vel_S = self._calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) @@ -805,6 +1031,9 @@ def _calculate_velocity( class DegradationDynamics(LabeledDynamics): + """Dynamics model for the degradation experiment. In degradation experiment, samples are chased after an extended + 4sU (or other nucleotide analog) labeling period and the wash-out to observe the decay of the abundance of the + (labeled) unspliced and spliced RNA decay over time.""" def _calculate_vel_U( self, vel: Velocity, @@ -847,6 +1076,7 @@ def _calculate_vel_T( class MixStdStmDynamics(LabeledDynamics): + """Dynamics model for the mixed steady state and stimulation labeling (mix_std_stm) experiment.""" def _calculate_vel_U( self, vel: Velocity, @@ -895,6 +1125,7 @@ def _calculate_velocity( N: Union[ndarray, csr_matrix], T: Union[ndarray, csr_matrix], ) -> Tuple: + """Override the velocity calculation function to calculate extra parameters u_new and alpha1.""" if self.has_splicing: u0, self.u_new, self.alpha1 = solve_alpha_2p_mat( t0=np.max(self.t) - self.t, @@ -920,6 +1151,7 @@ def _calculate_velocity( class MixKineticsDynamics(LabeledDynamics): + """Dynamics model for two mix experiment type: mix_kin_deg and mix_pulse_chase.""" def _calculate_vel_U( self, vel: Velocity, @@ -968,6 +1200,7 @@ def _calculate_velocity( N: Union[ndarray, csr_matrix], T: Union[ndarray, csr_matrix], ) -> Tuple: + """Override the velocity calculation function to reset beta when the data contains splicing information.""" if self.has_splicing: vel_U = self._calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) vel_S = self._calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) @@ -1002,7 +1235,8 @@ def dynamics_wrapper( tkey: str = None, **est_kwargs, ) -> AnnData: - """Run corresponding Dynamics methods according to the parameters.""" + """Predict the model and assumption if they are set as auto. Run corresponding Dynamics methods according to the + experiment type. 
More information can be found in the class BaseDynamics.""" if "pp" not in adata.uns_keys(): raise ValueError(f"\nPlease run `dyn.pp.receipe_monocle(adata)` before running this function!") if model.lower() == "auto": From b6ae88efa52b759060456ec942d998c6ce1bd886 Mon Sep 17 00:00:00 2001 From: sichao Date: Fri, 16 Jun 2023 12:29:59 -0400 Subject: [PATCH 14/31] add to tl init --- dynamo/tools/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dynamo/tools/__init__.py b/dynamo/tools/__init__.py index 1a4f7f776..f775a9ea3 100755 --- a/dynamo/tools/__init__.py +++ b/dynamo/tools/__init__.py @@ -40,7 +40,7 @@ # dimension reduction related from .dimension_reduction import reduceDimension # , run_umap -from .dynamics import dynamics +from .dynamics import dynamics, dynamics_wrapper # state graph related from .graph_calculus import GraphVectorField From 09843fa49db548a4f1e114b825f0f02b643414cd Mon Sep 17 00:00:00 2001 From: sichao Date: Fri, 16 Jun 2023 15:10:54 -0400 Subject: [PATCH 15/31] debug --- dynamo/tools/dynamics.py | 3 ++- dynamo/tools/utils.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/dynamo/tools/dynamics.py b/dynamo/tools/dynamics.py index 2b791aef8..fbf96d3c6 100755 --- a/dynamo/tools/dynamics.py +++ b/dynamo/tools/dynamics.py @@ -1339,7 +1339,8 @@ def dynamics_wrapper( estimator = MixKineticsDynamics(dynamics_kwargs) else: raise NotImplementedError("This method has not been implemented.") - estimator.estimate() + adata = estimator.estimate() + return adata # incorporate the model selection code soon diff --git a/dynamo/tools/utils.py b/dynamo/tools/utils.py index a5b1e6707..3ca06fb54 100755 --- a/dynamo/tools/utils.py +++ b/dynamo/tools/utils.py @@ -1172,7 +1172,7 @@ def get_auto_assump_mRNA( "ss" if NTR_vel else "kinetic", ) elif not has_splicing and "new" in subset_adata.layers.keys(): - assumption_mRNA = ("ss" if NTR_vel else "kinetic",) + assumption_mRNA = "ss" if NTR_vel else "kinetic" # splicing data if not has_labeling and ( ("X_unspliced" in subset_adata.layers.keys() and not use_moments) From 067dac7860b03ac488e043b7af2855769cd5858d Mon Sep 17 00:00:00 2001 From: QiangweiPeng Date: Mon, 19 Jun 2023 16:22:41 +0800 Subject: [PATCH 16/31] add subclass KineticsStormDynamics and upload storm.py to support Storm's three stochastic model --- dynamo/estimation/tsc/storm.py | 751 ++++++++++++++++++++++++++++++ dynamo/tools/dynamics.py | 822 +++++++++++++++++++++------------ 2 files changed, 1281 insertions(+), 292 deletions(-) create mode 100644 dynamo/estimation/tsc/storm.py diff --git a/dynamo/estimation/tsc/storm.py b/dynamo/estimation/tsc/storm.py new file mode 100644 index 000000000..8c038d8af --- /dev/null +++ b/dynamo/estimation/tsc/storm.py @@ -0,0 +1,751 @@ +from typing import Tuple, Union, Optional +from anndata import AnnData + +from scipy.sparse import ( + csr_matrix, + issparse, + SparseEfficiencyWarning, +) +import numpy as np +from matplotlib import pyplot as plt +from scipy.optimize import minimize +from tqdm import tqdm +from scipy.special import gammaln +from scipy.optimize import root, fsolve + +from dynamo.tools.utils import find_extreme + + +def mle_cell_specific_poisson_ss( + R: Union[np.ndarray, csr_matrix], + N: Union[np.ndarray, csr_matrix], + time: np.ndarray, + gamma_init: np.ndarray, + cell_total: np.ndarray, + Total_smoothed, + New_smoothed, +) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """"Infer parameters based on the cell specific Poisson model using maximum likelihood estimation 
under the + steady-state assumption + + Args: + R: The number of total mRNA counts for each gene in each cell. shape: (n_var, n_obs). + N: The number of new mRNA counts for each gene in each cell. shape: (n_var, n_obs). + time: The time point of each cell. shape: (n_obs,). + gamma_init: The initial value of gamma. shape: (n_var,). + cell_total: The total counts of reads for each cell. shape: (n_obs,). + + Returns: + gamma: The estimated total mRNA degradation rate gamma. shape: (n_var,). + gamma_r2: The R2 of gamma. shape: (n_var,). + gamma_r2_raw: The R2 of gamma without correction. shape: (n_var,). + alpha: The estimated gene specific transcription rate alpha. shape: (n_var,). + + """ + n_var = N.shape[0] + n_obs = N.shape[1] + cell_capture_rate = cell_total / np.median(cell_total) + + # When there is only one labeling duration we can obtain the analytical solution directly but cannot define the + # goodness-of-fit. + if len(np.unique(time)) == 1: + gamma = np.zeros(n_var) + gamma_r2 = np.ones(n_var) # As goodness of fit could not be defined, all were set to 1. + gamma_r2_raw = np.ones(n_var) + alpha = np.zeros(n_var) + for i, r, n, r_smooth, n_smooth in tqdm( + zip(np.arange(n_var), R, N, Total_smoothed, New_smoothed), + "Infer parameters via maximum likelihood estimation based on the CSP model under the steady-state assumption" + ): + n = n.A.flatten() if issparse(n) else n.flatten() + r = r.A.flatten() if issparse(r) else r.flatten() + n_smooth = n_smooth.A.flatten() if issparse(n_smooth) else n_smooth.flatten() + r_smooth = r_smooth.A.flatten() if issparse(r_smooth) else r_smooth.flatten() + t_unique = np.unique(time) + mask = find_extreme(n_smooth, r_smooth, perc_left=None, perc_right=50) + gamma[i] = - np.log(1 - np.mean(n[mask]) / np.mean(r[mask])) / t_unique + alpha[i] = gamma[i]*np.mean(r[mask])/np.mean(cell_capture_rate[mask]) + else: + gamma = np.zeros(n_var) + gamma_r2 = np.zeros(n_var) + gamma_r2_raw = np.zeros(n_var) + alphadivgamma = np.zeros(n_var) + for i, r, n in tqdm( + zip(np.arange(n_var), R, N), + "Infer parameters via maximum likelihood estimation based on the CSP model under the steady-state assumption" + ): + n = n.A.flatten() if issparse(n) else n.flatten() + r = r.A.flatten() if issparse(r) else r.flatten() + + def loss_func_ss(parameters): + # Loss function of cell specific Poisson model under the steady-state assumption + parameter_alpha_div_gamma, parameter_gamma = parameters + mu_new = parameter_alpha_div_gamma * (1 - np.exp(-parameter_gamma * time)) * cell_capture_rate + loss_new = -np.sum(n * np.log(mu_new) - mu_new) + mu_total = parameter_alpha_div_gamma * cell_capture_rate + loss_total = -np.sum(r * np.log(mu_total) - mu_total) + loss = loss_new + loss_total + return loss + + # Initialize and add boundary conditions + alpha_div_gamma_init = np.mean(n) / np.mean(cell_capture_rate * (1 - np.exp(-gamma_init[i] * time))) + b1 = (0, 10 * alpha_div_gamma_init) + b2 = (0, 10 * gamma_init[i]) + bnds = (b1, b2) + parameters_init = np.array([alpha_div_gamma_init, gamma_init[i]]) + + # Solve + res = minimize(loss_func_ss, parameters_init, method='SLSQP', bounds=bnds, tol=1e-2, options={'maxiter': 1000}) + # res = minimize(loss_func_ss, parameters_init, method='Nelder-Mead', tol=1e-2, options={'maxiter': 1000}) + # res = minimize(loss_func_ss, parameters_init, method='COBYLA', bounds=bnds, tol=1e-2, options={'maxiter': 1000}) + parameters = res.x + loss = res.fun + success = res.success + alphadivgamma[i], gamma[i] = parameters + + # Calculate deviance R2 as 
goodness of fit + + def null_loss_func_ss(parameters_null): + # Loss function of null model under the steady-state assumption + parameters_a0_new, parameters_a0_total = parameters_null + mu_new = parameters_a0_new * cell_capture_rate + loss0_new = -np.sum(n * np.log(mu_new) - mu_new) + mu_total = parameters_a0_total * cell_capture_rate + loss0_total = -np.sum(r * np.log(mu_total) - mu_total) + loss0 = loss0_new + loss0_total + return loss0 + + def saturated_loss_func_ss(): + # Loss function of saturated model under the steady-state assumption + loss_saturated_new = -np.sum(n[n > 0] * np.log(n[n > 0]) - n[n > 0]) + loss_saturated_total = -np.sum(r[r > 0] * np.log(r[r > 0]) - r[r > 0]) + loss_saturated = loss_saturated_new + loss_saturated_total + return loss_saturated + + a0_new = np.mean(n) / np.mean(cell_capture_rate) + a0_total = np.mean(r) / np.mean(cell_capture_rate) + loss0 = null_loss_func_ss((a0_new, a0_total)) + + loss_saturated = saturated_loss_func_ss() + null_devanice = 2 * (loss0 - loss_saturated) + devanice = 2 * (loss - loss_saturated) + gamma_r2_raw[i] = 1 - (devanice / (2*n_obs - 2)) / (null_devanice / (2*n_obs - 2)) + + # Top 40% genes were selected by goodness of fit + gamma_r2 = gamma_r2_raw.copy() + number_selected_genes = int(n_var * 0.4) + gamma_r2[gamma < 0.01] = 0 + sort_index = np.argsort(-gamma_r2) + gamma_r2[sort_index[:number_selected_genes]] = 1 + gamma_r2[sort_index[number_selected_genes + 1:]] = 0 + + alpha = alphadivgamma*gamma + + return gamma, gamma_r2, gamma_r2_raw, alpha + + +def mle_cell_specific_poisson( + N: Union[np.ndarray, csr_matrix], + time: np.ndarray, + gamma_init: np.ndarray, + cell_total: np.ndarray +) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """"Infer parameters based on cell specific Poisson distributions using maximum likelihood estimation + + Args: + N: The number of new mRNA counts for each gene in each cell. shape: (n_var, n_obs). + time: The time point of each cell. shape: (n_obs,). + gamma_init: The initial value of gamma. shape: (n_var,). + cell_total: The total counts of reads for each cell. shape: (n_obs,). + + Returns: + gamma: The estimated total mRNA degradation rate gamma. shape: (n_var,). + gamma_r2: The R2 of gamma. shape: (n_var,). + gamma_r2_raw: The R2 of gamma without correction. shape: (n_var,). + alpha: The estimated gene specific transcription rate alpha. shape: (n_var,). 
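Example (a sketch only; the layer and column names mirror the ones read by the helper functions at the end of this file and may differ in other datasets):

    N = adata.layers['new'].T                                  # genes x cells labeled counts
    time = adata.obs['time'].values                            # labeling duration of each cell
    cell_total = adata.obs['initial_cell_size'].astype(float).values
    gamma_init = adata.var['gamma'].values                     # e.g. a coarse two-step estimate
    gamma, gamma_r2, gamma_r2_raw, alpha = mle_cell_specific_poisson(N, time, gamma_init, cell_total)

The fitted mean for gene i in cell j is (alpha_i / gamma_i) * (1 - exp(-gamma_i * t_j)) * c_j, with c_j = cell_total_j / median(cell_total).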
+ """ + n_var = N.shape[0] + n_obs = N.shape[1] + gamma = np.zeros(n_var) + gamma_r2 = np.zeros(n_var) + gamma_r2_raw = np.zeros(n_var) + alphadivgamma = np.zeros(n_var) + for i, n in tqdm( + zip(np.arange(n_var), N), + "Infer parameters via maximum likelihood estimation based on the CSP model" + ): + n = n.A.flatten() if issparse(n) else n.flatten() + cell_capture_rate = cell_total / np.median(cell_total) + + def loss_func(parameters): + # Loss function of cell specific Poisson model + parameter_alpha_div_gamma, parameter_gamma = parameters + mu = parameter_alpha_div_gamma * (1 - np.exp(-parameter_gamma * time)) * cell_capture_rate + loss = -np.sum(n * np.log(mu) - mu) + return loss + + # Initialize and add boundary conditions + alpha_div_gamma_init = np.mean(n) / np.mean(cell_capture_rate * (1 - np.exp(-gamma_init[i] * time))) + b1 = (0, 10 * alpha_div_gamma_init) + b2 = (0, 10 * gamma_init[i]) + bnds = (b1, b2) + parameters_init = np.array([alpha_div_gamma_init, gamma_init[i]]) + + # Solve + res = minimize(loss_func, parameters_init, method='SLSQP', bounds=bnds, tol=1e-2, options={'maxiter': 1000}) + # res = minimize(loss_func, parameters_init, method='Nelder-Mead', tol=1e-2, options={'maxiter': 1000}) + # res = minimize(loss_func, parameters_init, method='COBYLA', bounds=bnds, tol=1e-2, options={'maxiter': 1000}) + parameters = res.x + loss = res.fun + success = res.success + alphadivgamma[i], gamma[i] = parameters + + # Calculate deviance R2 as goodness of fit + + def null_loss_func(parameters_null): + # Loss function of null model + parameters_a0 = parameters_null + mu = parameters_a0 * cell_capture_rate + loss0 = -np.sum(n * np.log(mu) - mu) + return loss0 + + def saturated_loss_func(): + # Loss function of saturated model + loss_saturated = -np.sum(n[n > 0] * np.log(n[n > 0]) - n[n > 0]) + return loss_saturated + + a0 = np.mean(n) / np.mean(cell_capture_rate) + loss0 = null_loss_func(a0) + + loss_saturated = saturated_loss_func() + null_devanice = 2 * (loss0 - loss_saturated) + devanice = 2 * (loss - loss_saturated) + gamma_r2_raw[i] = 1 - (devanice / (n_obs - 2)) / (null_devanice / (n_obs - 1)) + + # Top 40% genes were selected by goodness of fit + gamma_r2 = gamma_r2_raw.copy() + number_selected_genes = int(n_var * 0.4) + gamma_r2[gamma < 0.01] = 0 + sort_index = np.argsort(-gamma_r2) + gamma_r2[sort_index[:number_selected_genes]] = 1 + gamma_r2[sort_index[number_selected_genes + 1:]] = 0 + + return gamma, gamma_r2, gamma_r2_raw, alphadivgamma*gamma + + +def mle_cell_specific_zero_inflated_poisson( + N: Union[np.ndarray, csr_matrix], + time: np.ndarray, + gamma_init: np.ndarray, + cell_total: np.ndarray +) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """"Infer parameters based on cell specific zero-inflated Poisson distributions using maximum likelihood estimation + + Args: + N: The number of new mRNA counts for each gene in each cell. shape: (n_var, n_obs). + time: The time point of each cell. shape: (n_obs,). + gamma_init: The initial value of gamma. shape: (n_var,). + cell_total: The total counts of reads for each cell. shape: (n_obs,). + + Returns: + gamma: The estimated total mRNA degradation rate gamma. shape: (n_var,). + prob_off: The estimated probability of gene expression being in the off state $p_{off}$. shape: (n_var,). + gamma_r2: The R2 of gamma. shape: (n_var,). + gamma_r2_raw: The R2 of gamma without correction. shape: (n_var,). + alpha: The estimated gene specific transcription rate alpha. shape: (n_var,). 
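For reference, the zero-inflated Poisson likelihood maximized below (per gene i and cell j; the log(n!) constant is dropped, as in the code):

    P(n_ij = 0)         = p_off + (1 - p_off) * exp(-mu_ij)
    P(n_ij = k), k > 0  = (1 - p_off) * exp(-mu_ij) * mu_ij**k / k!
    mu_ij               = (alpha / gamma) * (1 - exp(-gamma * t_j)) * c_j,  c_j = cell_total_j / median(cell_total)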
+ """ + n_var = N.shape[0] + n_obs = N.shape[1] + gamma = np.zeros(n_var) + gamma_r2 = np.zeros(n_var) + gamma_r2_raw = np.zeros(n_var) + prob_off = np.zeros(n_var) + alphadivgamma = np.zeros(n_var) + + for i, n in tqdm( + zip(np.arange(n_var), N), + "Infer parameters via maximum likelihood estimation based on the CSZIP model" + ): + n = n.A.flatten() if issparse(n) else n.flatten() + cell_capture_rate = cell_total / np.median(cell_total) + + def loss_func(parameters): + # Loss function of cell specific zero-inflated Poisson model + parameter_alpha_div_gamma, parameter_gamma, parameter_prob_off = parameters + mu = parameter_alpha_div_gamma * (1 - np.exp(-parameter_gamma * time)) * cell_capture_rate + n_eq_0_index = n < 0.001 + n_over_0_index = n > 0.001 + loss_eq0 = -np.sum(np.log(parameter_prob_off + (1 - parameter_prob_off) * np.exp(-mu[n_eq_0_index]))) + loss_over0 = -np.sum(np.log(1 - parameter_prob_off) + (-mu[n_over_0_index]) + n[n_over_0_index] * np.log( + mu[n_over_0_index])) + loss = loss_eq0 + loss_over0 + return loss + + # Initialize and add boundary conditions + mean_n = np.mean(n) + s2_n = np.mean(np.power(n, 2)) + temp = np.mean(cell_capture_rate * (1 - np.exp(-gamma_init[i] * time))) + prob_off_init = 1 - mean_n * mean_n * np.mean( + np.power(cell_capture_rate * (1 - np.exp(-gamma_init[i] * time)), 2)) / ( + temp * temp * (s2_n - mean_n)) # Use moment estimation as the initial value of prob_off + alphadivgamma_init = mean_n / ((1 - prob_off_init) * temp) + b1 = (0, 10 * alphadivgamma_init) + b2 = (0, 10 * gamma_init[i]) + b3 = (0, (np.sum(n < 0.001) / np.sum(n > -1))) + bnds = (b1, b2, b3) + parameters_init = np.array([alphadivgamma_init, gamma_init[i], prob_off_init]) + + # Slove + res = minimize(loss_func, parameters_init, method='SLSQP', bounds=bnds, tol=1e-2, options={'maxiter': 1000}) + # res = minimize(loss_func, parameters_init, method='Nelder-Mead', tol=1e-2, options={'maxiter': 1000}) + # res = minimize(loss_func, parameters_init, method='COBYLA', bounds=bnds, tol=1e-2, options={'maxiter': 1000}) + parameters = res.x + alphadivgamma[i], gamma[i], prob_off[i] = parameters + loss = res.fun + success = res.success + + # Calculate deviance R2 as goodness of fit + + def null_Loss_func(parameters_null): + # Loss function of null model + parameters_null_lambda, parameters_null_prob_off = parameters_null + mu = parameters_null_lambda * cell_capture_rate + n_eq_0_index = n < 0.0001 + n_over_0_index = n > 0.0001 + null_loss_eq0 = -np.sum( + np.log(parameters_null_prob_off + (1 - parameters_null_prob_off) * np.exp(-mu[n_eq_0_index]))) + null_loss_over0 = -np.sum( + np.log(1 - parameters_null_prob_off) + (-mu[n_over_0_index]) + n[n_over_0_index] * np.log( + mu[n_over_0_index])) + null_loss = null_loss_eq0 + null_loss_over0 + return null_loss + + mean_cell_capture_rate = np.mean(cell_capture_rate) + prob_off_init_null = 1 - mean_n * mean_n * np.mean(np.power(cell_capture_rate, 2)) / ( + mean_cell_capture_rate * mean_cell_capture_rate * (s2_n - mean_n)) + lambda_init_null = mean_n / ((1 - prob_off_init_null) * mean_cell_capture_rate) + b1_null = (0, 10 * lambda_init_null) + b2_null = (0, (np.sum(n < 0.001) / np.sum(n > -1))) + bnds_null = (b1_null, b2_null) + parameters_init_null = np.array([lambda_init_null, prob_off_init_null]) + res_null = minimize(null_Loss_func, parameters_init_null, method='SLSQP', bounds=bnds_null, tol=1e-2, + options={'maxiter': 1000}) + loss0 = res_null.fun + + def saturated_loss_func(): + loss_saturated = -np.sum(n[n > 0] * np.log(n[n > 0]) - n[n > 0]) 
+ return loss_saturated + + loss_saturated = saturated_loss_func() + null_devanice = 2 * (loss0 - loss_saturated) + devanice = 2 * (loss - loss_saturated) + + gamma_r2_raw[i] = 1 - (devanice / (n_obs - 2)) / (null_devanice / (n_obs - 1)) + + # Top 40% genes were selected by goodness of fit + gamma_r2 = gamma_r2_raw.copy() + number_selected_genes = int(n_var * 0.4) + gamma_r2[gamma < 0.01] = 0 + sort_index = np.argsort(-gamma_r2) + gamma_r2[sort_index[:number_selected_genes]] = 1 + gamma_r2[sort_index[number_selected_genes + 1:]] = 0 + + return gamma, prob_off, gamma_r2, gamma_r2_raw, gamma*alphadivgamma + + +def mle_independent_cell_specific_poisson( + UL: Union[np.ndarray, csr_matrix], + SL: Union[np.ndarray, csr_matrix], + time: np.ndarray, + gamma_init: np.ndarray, + beta_init: np.ndarray, + cell_total: np.ndarray, + Total_smoothed: Union[np.ndarray, csr_matrix], + S_smoothed: Union[np.ndarray, csr_matrix] +) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + """"Infer parameters based on independent cell specific Poisson distributions using maximum likelihood estimation + + Args: + UL: The number of unspliced labeled mRNA counts for each gene in each cell. shape: (n_var, n_obs). + SL: The number of spliced labeled mRNA counts for each gene in each cell. shape: (n_var, n_obs). + time: The time point of each cell. shape: (n_obs,). + gamma_init: The initial value of gamma. shape: (n_var,). + beta_init: The initial value of beta. shape: (n_var,). + cell_total: The total counts of reads for each cell. shape: (n_obs,). + Total_smoothed: The number of total mRNA expression after normalization and smoothing for each gene in each cell. shape: (n_var, n_obs). + S_smoothed: The number of spliced mRNA expression after normalization and smoothing for each gene in each cell. shape: (n_var, n_obs). + + Returns: + gamma_s: The estimated spliced mRNA degradation rate gamma_s. shape: (n_var,). + gamma_r2: The R2 of gamma. shape: (n_var,). + beta: The estimated gene specific splicing rate beta. shape: (n_var,). + gamma_t: The estimated total mRNA degradation rate gamma_t. shape: (n_var,). + gamma_r2_raw: The R2 of gamma without correction. shape: (n_var,). + alpha: The estimated gene specific transcription rate alpha. shape: (n_var,). 
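For reference, the Poisson means fitted below for the unspliced and spliced labeled counts of cell j (c_j = cell_total_j / median(cell_total)):

    mu_ul = alpha / beta * (1 - exp(-beta * t_j)) * c_j
    mu_sl = (alpha / gamma_s * (1 - exp(-gamma_s * t_j))
             + alpha / (gamma_s - beta) * (exp(-gamma_s * t_j) - exp(-beta * t_j))) * c_j

The spliced rate is converted to the returned total degradation rate via gamma_t = gamma_s * sum(s**2) / sum(r * s), with r and s the smoothed total and spliced expression; the same relation, inverted, is used to seed gamma_s from gamma_init.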
+ """ + n_var = UL.shape[0] + n_obs = UL.shape[1] + gamma_s = np.zeros(n_var) + gamma_r2 = np.zeros(n_var) + gamma_r2_raw = np.zeros(n_var) + beta = np.zeros(n_var) + alpha = np.zeros(n_var) + gamma_t = np.zeros(n_var) + + for i, ul, sl, r, s in tqdm( + zip(np.arange(n_var), UL, SL, Total_smoothed, S_smoothed), + "Estimate gamma via maximum likelihood estimation based on the ICSP model " + ): + sl = sl.A.flatten() if issparse(sl) else sl.flatten() + ul = ul.A.flatten() if issparse(ul) else ul.flatten() + r = r.A.flatten() if issparse(r) else r.flatten() + s = s.A.flatten() if issparse(s) else s.flatten() + + cell_capture_rate = cell_total / np.median(cell_total) + + def loss_func(parameters): + # Loss function of independent cell specific Poisson model + parameter_alpha, parameter_beta, parameter_gamma_s = parameters + mu_u = parameter_alpha / parameter_beta * (1 - np.exp(-parameter_beta * time)) * cell_capture_rate + mu_s = (parameter_alpha / parameter_gamma_s * (1 - np.exp(-parameter_gamma_s * time)) + parameter_alpha / + (parameter_gamma_s - parameter_beta) * (np.exp(-parameter_gamma_s * time) - np.exp( + -parameter_beta * time))) * cell_capture_rate + loss_u = -np.sum(ul * np.log(mu_u) - mu_u) + loss_s = -np.sum(sl * np.log(mu_s) - mu_s) + loss = loss_u + loss_s + return loss + + # The initial values of gamma_s, beta and alpha are obtained from the initial values of gamma_t. + gamma_s_init = gamma_init[i] * np.sum(r * s) / np.sum(np.power(s, 2)) + beta_init_new = beta_init[i] * gamma_s_init / gamma_init[i] + alpha_init = np.mean(ul + sl) / np.mean(cell_capture_rate * ( + (1 - np.exp(-beta_init_new * time)) / beta_init_new + (1 - np.exp(-gamma_s_init * time)) / gamma_s_init + + (np.exp(-gamma_s_init * time) - np.exp(-beta_init_new * time)) / (gamma_s_init - beta_init_new))) + + # Initialize and add boundary conditions + b1 = (0, 10 * alpha_init) + b2 = (0, 10 * beta_init_new) + b3 = (0, 10 * gamma_s_init) + bnds = (b1, b2, b3) + parameters_init = np.array([alpha_init, beta_init_new, gamma_s_init]) + + # Solve + res = minimize(loss_func, parameters_init, method='SLSQP', bounds=bnds, tol=1e-2, options={'maxiter': 1000}) + # res = minimize(loss_func, parameters_init, method='Nelder-Mead', tol=1e-2, options={'maxiter': 1000}) + # res = minimize(loss_func, parameters_init, method='COBYLA', bounds=bnds, tol=1e-2, options={'maxiter': 1000}) + parameters = res.x + loss = res.fun + success = res.success + alpha[i], beta[i], gamma_s[i] = parameters + + # Calculate deviance R2 as goodness of fit + + def null_loss_func(parameters_null): + # Loss function of null model + parameters_a0, parameters_b0 = parameters_null + mu_u = parameters_a0 * cell_capture_rate + mu_s = parameters_b0 * cell_capture_rate + loss0_u = -np.sum(ul * np.log(mu_u) - mu_u) + loss0_s = -np.sum(sl * np.log(mu_s) - mu_s) + loss0 = loss0_u + loss0_s + return loss0 + + b0 = np.mean(ul) / np.mean(cell_capture_rate) + c0 = np.mean(sl) / np.mean(cell_capture_rate) + loss0 = null_loss_func((b0, c0)) + + def saturated_loss_func(): + # Loss function of saturated model + loss_saturated_u = -np.sum(ul[ul > 0] * np.log(ul[ul > 0]) - ul[ul > 0]) + loss_saturated_s = -np.sum(sl[sl > 0] * np.log(sl[sl > 0]) - sl[sl > 0]) + loss_saturated = loss_saturated_u + loss_saturated_s + return loss_saturated + + loss_saturated = saturated_loss_func() + null_devanice = 2 * (loss0 - loss_saturated) + devanice = 2 * (loss - loss_saturated) + gamma_r2_raw[i] = 1 - (devanice / (2 * n_obs - 3)) / (null_devanice / (2 * n_obs - 2)) # + 0.82 + + gamma_t[i] = 
gamma_s[i] * np.sum(np.power(s, 2)) / np.sum(r * s) + + # Top 40% genes were selected by goodness of fit + gamma_r2 = gamma_r2_raw.copy() + number_selected_genes = int(n_var * 0.4) + gamma_r2[gamma_s < 0.01] = 0 + sort_index = np.argsort(-gamma_r2) + gamma_r2[sort_index[:number_selected_genes]] = 1 + gamma_r2[sort_index[number_selected_genes + 1:]] = 0 + + return gamma_s, gamma_r2, beta, gamma_t, gamma_r2_raw, alpha + + +def cell_specific_alpha_beta( + UL_smoothed_CSP: Union[np.ndarray, csr_matrix], + SL_smoothed_CSP: Union[np.ndarray, csr_matrix], + time: np.ndarray, + gamma_init: np.ndarray, + beta_init: np.ndarray +) -> Tuple[csr_matrix, csr_matrix]: + """"Infer cell specific transcription rate and splicing rate based on ICSP model + + Args: + UL_smoothed_CSP: The number of unspliced labeled mRNA expression after smoothing based on CSP type model for + each gene in each cell. shape: (n_var, n_obs). + SL_smoothed_CSP: The number of spliced labeled mRNA expression after smoothing based on CSP type model for + each gene in each cell. shape: (n_var, n_obs). + time: The time point of each cell. shape: (n_obs,). + gamma_init: The gene wise initial value of gamma. shape: (n_var,). + beta_init: The gene wise initial value of beta. shape: (n_var,). + + Returns: alpha_cs, beta_cs + alpha_cs: The transcription rate for each gene in each cell. shape: (n_var, n_obs). + beta_cs: The splicing rate for each gene in each cell. shape: (n_var, n_obs). + """ + beta_cs = np.zeros_like(UL_smoothed_CSP.A) if issparse(UL_smoothed_CSP) else np.zeros_like(UL_smoothed_CSP) + + n_var = UL_smoothed_CSP.shape[0] + n_obs = UL_smoothed_CSP.shape[1] + + for i, ul, sl, gamma_i, beta_i in tqdm( + zip(np.arange(n_var), UL_smoothed_CSP, SL_smoothed_CSP, gamma_init, beta_init), + "Estimate cell specific alpha and beta" + ): + sl = sl.A.flatten() if issparse(sl) else sl.flatten() + ul = ul.A.flatten() if issparse(ul) else ul.flatten() + + for j in range(n_obs): + sl_j = sl[j] + ul_j = ul[j] + sl_div_ul_j = sl_j / ul_j + time_j = time[j] + + def solve_beta_func(beta_j): + # Equation for solving cell specific beta + return sl_div_ul_j - (1 - np.exp(-gamma_i * time_j)) / gamma_i * beta_j / (1 - np.exp(-beta_j * time_j)) \ + - beta_j / (gamma_i - beta_j) * (np.exp(-gamma_i * time_j) - np.exp(-beta_j * time_j)) / \ + (1 - np.exp(-beta_j * time_j)) + + beta_j_solve = root(solve_beta_func, beta_i) + # beta_j_solve = fsolve(solve_beta_func, beta_i) + + beta_cs[i, j] = beta_j_solve.x + + k = 1 - np.exp(-beta_cs * (np.tile(time, (n_var, 1)))) + beta_cs = csr_matrix(beta_cs) + alpha_cs = beta_cs.multiply(UL_smoothed_CSP).multiply(1 / k) + return alpha_cs, beta_cs + + +def visualize_CSP_loss_landscape( + adata: AnnData, + gene_name_list: list, + figsize: tuple = (3, 3), + dpi: int = 75, + save_name: Optional[str] = None): + """"Draw the landscape of CSP model-based loss function for the given genes. + + Args: + adata: class:`~anndata.AnnData` + an Annodata object + gene_name_list: A list of gene names that are going to be visualized. + figsize: The width and height of each panel in the figure. + dpi: The dot per inch of the figure. + save_name: The save path for visualization results. save_name = None means that only show but not save the + results. + + Returns: + ------- + A matplotlib plot that shows the landscape of CSP model-based loss function for the given genes. 
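A usage sketch; it assumes `adata` already carries obs['time'], obs['initial_cell_size'], layers['new'] and a prior var['gamma'] estimate, which is what the function body below reads, and that the module is importable from this path once the patch is applied (gene names are placeholders):

    from dynamo.estimation.tsc.storm import visualize_CSP_loss_landscape

    visualize_CSP_loss_landscape(adata, ["GeneA", "GeneB"], figsize=(3, 3), dpi=75, save_name=None)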
+ """ + + def _traverse_CSP(n, time, gamma_init, cell_total): + """Traverse the CSP loss function to draw the landscape""" + n = n.A.flatten() if issparse(n) else n.flatten() + cell_capture_rate = cell_total / np.median(cell_total) + + def loss_func(parameters): + # Loss function of cell specific Poisson model + parameter_alpha_div_gamma, parameter_gamma = parameters + mu = parameter_alpha_div_gamma * (1 - np.exp(-parameter_gamma * time)) * cell_capture_rate + loss = -np.sum(n * np.log(mu) - mu - gammaln(n + 1)) + return loss + + def dldalpha_eq0(gamma): + # Analytic solution to the equation that the derivative of the loss with respect to alpha is equal to 0 + alpha_div_gamma_dldalpha_eq0 = np.mean(n) / np.mean(cell_capture_rate * (1 - np.exp(-gamma * time))) + return alpha_div_gamma_dldalpha_eq0 + + def alpha_constant(gamma): + # When gamma is sufficiently small, alpha is approximated as a constant. + alpha_div_gamma_constant = np.mean(n) / np.mean(cell_capture_rate * (gamma * time)) + return alpha_div_gamma_constant + + # Determine the scope of the traversal + alpha_div_gamma_init = np.mean(n / (1 - np.exp(-gamma_init * time))) + gamma_range = gamma_init * np.logspace(-2, 1, base=5, num=200) + alpha_div_gamma_range = alpha_div_gamma_init * np.logspace(-2, 1, base=5, num=200) + + # Iterate over the value of the loss function in the given range + loss_all = np.zeros((len(gamma_range), len(alpha_div_gamma_range))) + for s in range(len(gamma_range)): + for t in range(len(alpha_div_gamma_range)): + gamma_temp = gamma_range[s] + alpha_div_gamma_temp = alpha_div_gamma_range[t] + loss_all[s, t] = loss_func((alpha_div_gamma_temp, gamma_temp)) + + # Create grid data for drawing + X, Y = np.meshgrid(gamma_range, alpha_div_gamma_range) + Z = np.transpose(loss_all) + + # Calculate the loss value where dl/dalpha is equal to 0 and alpha is equal to a constant + alpha_div_gamma_dldalpha_eq0_range = np.zeros_like(gamma_range) + alpha_div_gamma_constant_range = np.zeros_like(gamma_range) + loss_dldalpha_eq0_range = np.zeros_like(gamma_range) + loss_constant_range = np.zeros_like(gamma_range) + for s in range(len(gamma_range)): + alpha_div_gamma_dldalpha_eq0_range[s] = dldalpha_eq0(gamma_range[s]) + alpha_div_gamma_constant_range[s] = alpha_constant(gamma_range[s]) + loss_dldalpha_eq0_range[s] = loss_func((alpha_div_gamma_dldalpha_eq0_range[s], gamma_range[s])) + loss_constant_range[s] = loss_func((alpha_div_gamma_constant_range[s], gamma_range[s])) + + return X, Y, Z, gamma_range, alpha_div_gamma_dldalpha_eq0_range, \ + alpha_div_gamma_constant_range, loss_dldalpha_eq0_range, loss_constant_range + + def _plot_landscape(X, Y, Z, gamma, alpha_div_gamma_dldalpha_eq0, alpha_div_gamma_constant, + loss_dldalpha_eq0, loss_constant, figsize, dpi, gene_name, save_name): + """Function to draw the landscape, dl/d$\alpha$ and $\alpha_cons$.""" + + # Adjust the range of the parameter to make the results clearer + index1 = np.where(np.logical_and(gamma > np.min(X), gamma < np.max(X))) + index2_dldgeq0 = np.where( + np.logical_and(alpha_div_gamma_dldalpha_eq0 > np.min(Y), alpha_div_gamma_dldalpha_eq0 < np.max(Y))) + index_dldgeq0 = np.intersect1d(index1, index2_dldgeq0) + index2_constant = np.where( + np.logical_and(alpha_div_gamma_constant > np.min(Y), alpha_div_gamma_constant < np.max(Y))) + index_constant = np.intersect1d(index1, index2_constant) + + # Create figure + fig = plt.figure(figsize=figsize, dpi=dpi) + ax = fig.add_subplot(111, projection='3d') + plt.tick_params(pad=-2) + + # Create plot + surf = 
ax.plot_surface(X, Y, Z, cmap='rainbow', rstride=1, cstride=1, alpha=0.75) + ax.plot(gamma[index_dldgeq0], alpha_div_gamma_dldalpha_eq0[index_dldgeq0], loss_dldalpha_eq0[index_dldgeq0], + color='black', + linewidth=1, label='$\\frac{\partial \ell}{\partial \\alpha}(\\alpha, \gamma_{t})=0$') + ax.plot(gamma[index_constant], alpha_div_gamma_constant[index_constant], loss_constant[index_constant], + color='red', + linewidth=1, label='$\\alpha=\\alpha_{cons}$') + plt.legend() + + cax = fig.add_axes([0.005, 0.15, 0.025, 0.75]) # left down right up + fig.colorbar(surf, ax=ax, shrink=0.5, aspect=5, cax=cax) + + # Add labels + ax.set_xlabel('$\gamma_{t}$', labelpad=-7) + ax.set_ylabel('$\\alpha/\gamma_{t}$', labelpad=-7) + ax.set_zlabel('$-\ell(\\alpha,\gamma_{t})$', labelpad=-7) + ax.set_zlim(np.min(Z), np.max(Z)) + ax.set_title(f'Loss function landscape of for gene {gene_name}') + ax.zaxis.get_major_formatter().set_powerlimits((0, 1)) + + # ax.view_init(azim=-50) + fig.tight_layout() + plt.grid(False) + if save_name: + plt.savefig(save_name) + plt.show() + + sub_adata = adata[:, gene_name_list] + cell_total = sub_adata.obs['initial_cell_size'].astype("float").values + time = sub_adata.obs['time'] + N = sub_adata.layers['new'].T + gamma_init = sub_adata.var['gamma'] + n_var = len(gene_name_list) + for i, n, gene, gamma_init_i in tqdm( + zip(np.arange(n_var), N, gene_name_list, gamma_init), + 'Visualize the landscape of the CSP model loss function' + ): + X, Y, Z, gamma, alpha_div_gamma_dldalpha_eq0, alpha_div_gamma_constant, loss_dldalpha_eq0, loss_constant = \ + _traverse_CSP(n, time, gamma_init_i, cell_total) + _plot_landscape(X, Y, Z, gamma, alpha_div_gamma_dldalpha_eq0, alpha_div_gamma_constant, loss_dldalpha_eq0, + loss_constant, figsize, dpi, gene, save_name) + + +def robustness_measure_CSP( + adata: AnnData, + gene_name_list: list, +) -> np.ndarray: + """Calculate the robustness measure based on CSP model inference of the given genes + + Args: + adata: class:`~anndata.AnnData` + an Annodata object + gene_name_list: A list of gene names that are going to be calculated robustness measure based on CSP model. + + Returns: + robustness_measure: The robustness measure based on CSP model inference of the given genes. + shape: (len(gene_name_list),). + """ + sub_adata = adata[:, gene_name_list] + cell_total = sub_adata.obs['initial_cell_size'].astype("float").values + time = sub_adata.obs['time'] + N = sub_adata.layers['new'].T + robustness_measure = calculate_robustness_measure_CSP(N, time, cell_total) + return robustness_measure + + +def calculate_robustness_measure_CSP( + N: Union[np.ndarray, csr_matrix], + time: np.ndarray, + cell_total: np.ndarray +) -> np.ndarray: + """Calculate the robustness measure based on CSP model inference + + Args: + N: The number of new mRNA counts for each gene in each cell. shape: (n_var, n_obs). + time: The time point of each cell. shape: (n_obs,). + cell_total: The total counts of reads for each cell. shape: (n_obs,). + + Returns: + robustness_measure: The robustness measure based on CSP model inference for each gene. shape: (n_var,). + """ + n_var = N.shape[0] + robustness_measure = np.zeros(n_var) + for i, n in tqdm( + zip(np.arange(n_var), N), + "Calculate the robustness measure" + ): + n = n.A.flatten() if issparse(n) else n.flatten() + cell_capture_rate = cell_total / np.median(cell_total) + + def partial_loss_partial_gamma(parameters): + # Partial derivative of loss with respect to gamma. 
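# With alpha/gamma profiled at its optimum for the current gamma (the closed form computed just below),
# the derivative reduces to
#   sum_i [ -n_i * t_i * exp(-gamma * t_i) / (1 - exp(-gamma * t_i))
#           + c_i * (alpha/gamma)_opt * t_i * exp(-gamma * t_i) ],
# which is exactly the expression assigned to pLoss_pgamma.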
+ parameter_gamma = parameters + optimal_alphadivgamma = np.mean(n) / np.mean(cell_capture_rate * (1 - np.exp(-parameter_gamma * time))) + pLoss_pgamma = np.sum(-n * time * np.exp(-parameter_gamma * time) / (1 - np.exp( + -parameter_gamma * time)) + cell_capture_rate * optimal_alphadivgamma * time * np.exp( + -parameter_gamma * time)) + return pLoss_pgamma + + def loss_func(parameters): + # Loss function of cell specific Poisson model + parameter_alpha_div_gamma, parameter_gamma = parameters + mu = parameter_alpha_div_gamma * (1 - np.exp(-parameter_gamma * time)) * cell_capture_rate + loss = -np.sum(n * np.log(mu) - mu - gammaln(n + 1)) + return loss + + gamma_range = np.arange(0.01, 1.51, 0.01) + loss = np.zeros_like(gamma_range) + p_loss_p_gamma = np.zeros_like(gamma_range) + for s in range(len(gamma_range)): + gamma_temp = gamma_range[s] + alpha_div_gamma_temp = np.mean(n) / np.mean(cell_capture_rate * (1 - np.exp(-gamma_temp * time))) + p_loss_p_gamma[s] = partial_loss_partial_gamma(gamma_temp) + loss[s] = loss_func((gamma_temp, alpha_div_gamma_temp)) + + # robust_measure[i] = np.mean(np.abs(p_loss_p_gamma)) + robustness_measure[i] = np.sum(np.abs(loss[1:] - loss[0:-1])) + + return robustness_measure diff --git a/dynamo/tools/dynamics.py b/dynamo/tools/dynamics.py index 2b791aef8..74591b4b6 100755 --- a/dynamo/tools/dynamics.py +++ b/dynamo/tools/dynamics.py @@ -30,6 +30,7 @@ from ..estimation.tsc.estimation_kinetic import * from ..estimation.tsc.twostep import fit_slope_stochastic, lin_reg_gamma_synthesis from ..estimation.tsc.utils_kinetic import * +from ..estimation.tsc import storm from .moments import ( moments, prepare_data_deterministic, @@ -266,6 +267,7 @@ class BaseDynamics: NTR_vel: Whether to estimate NTR velocity log_unnormalized: Whether to log transform unnormalized data. 
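A quick way to inspect how a run was configured, using the .uns['dynamics'] record described above (a sketch; only keys listed above are assumed to be present):

    info = adata.uns["dynamics"]
    print(info["experiment_type"], info["asspt_mRNA"], info["model"])
    print("has_splicing:", info["has_splicing"], "has_labeling:", info["has_labeling"])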
""" + def __init__(self, dynamics_kwargs: Dict): self.adata = dynamics_kwargs["adata"] self.filter_gene_mode = dynamics_kwargs["filter_gene_mode"] @@ -402,7 +404,9 @@ def _estimate_params_kin(self, cur_grp_i: int, cur_grp: str, subset_adata: AnnDa params.loc[:, "gamma"].values if "gamma" in params.columns else None, ) if self.alpha is None: - self.alpha = fbar(self.a, self.b, self.alpha_a, 0) if self.alpha_i is None else fbar(self.a, self.b, self.alpha_a, self.alpha_i) + self.alpha = fbar(self.a, self.b, self.alpha_a, 0) if self.alpha_i is None else fbar(self.a, self.b, + self.alpha_a, + self.alpha_i) all_kinetic_params = [ "a", "b", @@ -421,22 +425,23 @@ def estimate_parameters(self, cur_grp_i: int, cur_grp: str, subset_adata: AnnDat if self.assumption_mRNA.lower() == "ss" or (self.experiment_type.lower() in ["one-shot", "mix_std_stm"]): self._estimate_params_ss(subset_adata=subset_adata, **est_params_args) elif self.assumption_mRNA.lower() == "kinetic": - self._estimate_params_kin(cur_grp_i=cur_grp_i, cur_grp=cur_grp, subset_adata=subset_adata, **est_params_args) + self._estimate_params_kin(cur_grp_i=cur_grp_i, cur_grp=cur_grp, subset_adata=subset_adata, + **est_params_args) else: main_warning("Not implemented yet.") def set_velocity( - self, - vel_U: Union[ndarray, csr_matrix], - vel_S: Union[ndarray, csr_matrix], - vel_N: Union[ndarray, csr_matrix], - vel_T: Union[ndarray, csr_matrix], - vel_P: Union[ndarray, csr_matrix], - cur_grp: int, - cur_cells_bools: ndarray, - valid_bools_: ndarray, - kin_param_pre: str, - **set_velo_args, + self, + vel_U: Union[ndarray, csr_matrix], + vel_S: Union[ndarray, csr_matrix], + vel_N: Union[ndarray, csr_matrix], + vel_T: Union[ndarray, csr_matrix], + vel_P: Union[ndarray, csr_matrix], + cur_grp: int, + cur_cells_bools: ndarray, + valid_bools_: ndarray, + kin_param_pre: str, + **set_velo_args, ): """Save the calculated parameters and velocity to anndata. Override this in the subclass if the class has a different assumption.""" @@ -507,12 +512,12 @@ def set_velocity( main_warning("Not implemented yet.") def _calculate_velocity( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Tuple: """The core function to calculate the RNA velocity. Every subclass needs to implement this function. 
@@ -529,12 +534,12 @@ def _calculate_velocity( raise NotImplementedError("This method has not been implemented.") def _calculate_vel_P( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: """Calculate the protein velocity.""" return vel.vel_p(T, self.P) if self.NTR_vel else vel.vel_p(S, self.P) @@ -615,12 +620,12 @@ def _smooth(self, valid_bools: ndarray): ) def _sanity_check( - self, - valid_bools: ndarray, - valid_bools_: ndarray, - gene_num: int, - subset_adata: AnnData, - kin_param_pre: str, + self, + valid_bools: ndarray, + valid_bools_: ndarray, + gene_num: int, + subset_adata: AnnData, + kin_param_pre: str, ) -> Tuple: """Perform sanity check by checking the slope for kinetic or degradation metabolic labeling experiments.""" indices_valid_bools = np.where(valid_bools)[0] @@ -776,13 +781,14 @@ def estimate(self): class SplicedDynamics(BaseDynamics): """Dynamics models for RNA data only contain spliced RNA. This includes the conventional, generalized moments method (GMM) and negative binomial (NB) distribution method.""" + def _calculate_velocity( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Tuple: """Implement the velocity calculation function for splicing data. Calculate unspliced and spliced velocity.""" vel_U = vel.vel_u(U) @@ -794,57 +800,58 @@ def _calculate_velocity( class LabeledDynamics(BaseDynamics): """Dynamics model for metabolic labeling data.""" + def _calculate_vel_U( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: """Calculate unspliced velocity. All subclass should implement this method.""" raise NotImplementedError("This method has not been implemented.") def _calculate_vel_S( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: """Calculate spliced velocity. All subclass should implement this method.""" raise NotImplementedError("This method has not been implemented.") def _calculate_vel_N( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: """Calculate new velocity. 
All subclass should implement this method.""" raise NotImplementedError("This method has not been implemented.") def _calculate_vel_T( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: """Calculate total velocity. All subclass should implement this method.""" raise NotImplementedError("This method has not been implemented.") def _calculate_velocity( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Tuple: """Implement the velocity calculation function for metabolic labeling data. Unsplcied and spliced velocity will be nan for data without splicing information.""" @@ -860,43 +867,44 @@ def _calculate_velocity( class OneShotDynamics(LabeledDynamics): """Dynamics model for the one shot experiment, where there is only one labeling time point.""" + def _calculate_vel_U( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: return vel.vel_u(U) def _calculate_vel_S( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: return vel.vel_s(U, S) def _calculate_vel_N( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: return vel.vel_u(N) def _calculate_vel_T( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: return vel.vel_s(N, T - N) if self.has_splicing else vel.vel_u(T) @@ -904,53 +912,54 @@ def _calculate_vel_T( class SSKineticsDynamics(LabeledDynamics): """Two-step dynamics model for the Kinetic experiment with steady state assumption, which relies on two consecutive linear regressions to estimate the degradation rate.""" + def _calculate_vel_U( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: 
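        # Kc (= 1 - exp(-gamma_ * t)) is the new-to-total slope fitted under the steady-state assumption and
        # gamma_ the corresponding degradation rate; both are set on the estimator in _calculate_velocity below.
        # N * (gamma_ / Kc) therefore recovers the transcription rate, so vel_U = alpha - beta * U.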
return N.multiply(csr_matrix(self.gamma_ / self.Kc)) - csr_matrix(self.beta).multiply(U) def _calculate_vel_S( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: return vel.vel_s(U, S) def _calculate_vel_N( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: return (N - csr_matrix(self.Kc).multiply(N)).multiply(csr_matrix(self.gamma_ / self.Kc)) def _calculate_vel_T( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: return (N - csr_matrix(self.Kc).multiply(T)).multiply(csr_matrix(self.gamma_ / self.Kc)) def _calculate_velocity( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Tuple: """Override the velocity calculation function to calculate extra parameters slope and actual gamma.""" self.Kc = np.clip(self.gamma[:, None], 0, 1 - 1e-3) # S - U slope @@ -968,53 +977,54 @@ def _calculate_velocity( class KineticsDynamics(LabeledDynamics): """Dynamic models for the kinetic experiment with kinetic assumption. 
This includes a kinetic two-step method and the direct method.""" + def _calculate_vel_U( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: return vel.vel_u(U) def _calculate_vel_S( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: return vel.vel_s(U, S) def _calculate_vel_N( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: return vel.vel_u(N) def _calculate_vel_T( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: return vel.vel_u(T) def _calculate_velocity( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Tuple: """Override the velocity calculation function to reset beta or alpha.""" if self.has_splicing: @@ -1030,100 +1040,174 @@ def _calculate_velocity( return vel_U, vel_S, vel_N, vel_T +class KineticsStormDynamics(LabeledDynamics): + """Stochastic transient dynamics for the kinetic experiment with kinetic assumption. This includes three stochastic + models. In Model 1, only transcription and mRNA degradation were considered. In Model 2, we considered + transcription, splicing, and spliced mRNA degradation. 
And in Model 3, we considered the switching of gene + expression states, transcription in the active state, and mRNA degradation.""" + + def _calculate_vel_U( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: + return vel.vel_u(U) + + def _calculate_vel_S( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: + return vel.vel_s(U, S) + + def _calculate_vel_N( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: + if self.est_method == 'storm-icsp': + return vel.vel_u(self.Sl) + else: + return vel.vel_u(N) + + def _calculate_vel_T( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: + if self.est_method == 'storm-icsp': + return vel.vel_u(S) + else: + return vel.vel_u(T) + + def _calculate_velocity( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Tuple: + """Override the velocity calculation function to reset beta or alpha.""" + if self.has_splicing: + vel_U = self._calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) + vel_S = self._calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) + vel.parameters["beta"] = self.gamma + else: + vel_U, vel_S = np.nan, np.nan + vel_N = self._calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) + vel_T = self._calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) + return vel_U, vel_S, vel_N, vel_T + + class DegradationDynamics(LabeledDynamics): """Dynamics model for the degradation experiment. 
In degradation experiment, samples are chased after an extended 4sU (or other nucleotide analog) labeling period and the wash-out to observe the decay of the abundance of the (labeled) unspliced and spliced RNA decay over time.""" + def _calculate_vel_U( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: return np.nan def _calculate_vel_S( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: return vel.vel_s(U, S) def _calculate_vel_N( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: return np.nan def _calculate_vel_T( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: return np.nan class MixStdStmDynamics(LabeledDynamics): """Dynamics model for the mixed steady state and stimulation labeling (mix_std_stm) experiment.""" + def _calculate_vel_U( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: return self.alpha1 - csr_matrix(self.beta[:, None]).multiply(U) def _calculate_vel_S( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: return vel.vel_s(U, S) def _calculate_vel_N( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: return self.alpha1 - csr_matrix(self.gamma[:, None]).multiply(self.u_new) def _calculate_vel_T( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: 
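        # alpha1 (the cell-wise transcription rate under stimulation) and u_new are computed in
        # _calculate_velocity below, so the total velocity is that synthesis rate minus gamma * T.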
return self.alpha1 - csr_matrix(self.gamma[:, None]).multiply(T) def _calculate_velocity( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Tuple: """Override the velocity calculation function to calculate extra parameters u_new and alpha1.""" if self.has_splicing: @@ -1152,53 +1236,54 @@ def _calculate_velocity( class MixKineticsDynamics(LabeledDynamics): """Dynamics model for two mix experiment type: mix_kin_deg and mix_pulse_chase.""" + def _calculate_vel_U( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: return vel.vel_u(U, repeat=True) def _calculate_vel_S( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: return vel.vel_s(U, S) def _calculate_vel_N( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: return vel.vel_u(N, repeat=True) def _calculate_vel_T( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: return vel.vel_u(T) if not self.has_splicing and self.NTR_vel else vel.vel_u(T, repeat=True) def _calculate_velocity( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Tuple: """Override the velocity calculation function to reset beta when the data contains splicing information.""" if self.has_splicing: @@ -1214,26 +1299,26 @@ def _calculate_velocity( # TODO: rename this later def dynamics_wrapper( - adata: AnnData, - filter_gene_mode: Literal["final", "basic", "no"] = "final", - use_smoothed: bool = True, - assumption_mRNA: Literal["ss", "kinetic", "auto"] = "auto", - assumption_protein: Literal["ss"] = "ss", - model: Literal["auto", "deterministic", "stochastic"] = "auto", - est_method: Literal["ols", "rlm", "ransac", "gmm", "negbin", "auto", "twostep", "direct"] = "auto", - NTR_vel: bool = False, - group: Optional[str] = None, - protein_names: Optional[List[str]] = None, - concat_data: bool = False, - log_unnormalized: bool = True, - one_shot_method: Literal["combined", "sci-fate", 
"sci_fate"] = "combined", - fraction_for_deg: bool = False, - re_smooth: bool = False, - sanity_check: bool = False, - del_2nd_moments: Optional[bool] = None, - cores: int = 1, - tkey: str = None, - **est_kwargs, + adata: AnnData, + filter_gene_mode: Literal["final", "basic", "no"] = "final", + use_smoothed: bool = True, + assumption_mRNA: Literal["ss", "kinetic", "auto"] = "auto", + assumption_protein: Literal["ss"] = "ss", + model: Literal["auto", "deterministic", "stochastic"] = "auto", + est_method: Literal["ols", "rlm", "ransac", "gmm", "negbin", "auto", "twostep", "direct"] = "auto", + NTR_vel: bool = False, + group: Optional[str] = None, + protein_names: Optional[List[str]] = None, + concat_data: bool = False, + log_unnormalized: bool = True, + one_shot_method: Literal["combined", "sci-fate", "sci_fate"] = "combined", + fraction_for_deg: bool = False, + re_smooth: bool = False, + sanity_check: bool = False, + del_2nd_moments: Optional[bool] = None, + cores: int = 1, + tkey: str = None, + **est_kwargs, ) -> AnnData: """Predict the model and assumption if they are set as auto. Run corresponding Dynamics methods according to the experiment type. More information can be found in the class BaseDynamics.""" @@ -1323,12 +1408,19 @@ def dynamics_wrapper( if experiment_type == "conventional": estimator = SplicedDynamics(dynamics_kwargs) elif experiment_type in ["one-shot", "one_shot"]: - estimator = OneShotDynamics(dynamics_kwargs) + if model == 'deterministic': + estimator = OneShotDynamics(dynamics_kwargs) + elif model == 'stochastic': + dynamics_kwargs['est_method'] = 'storm-csp' + estimator = OneShotDynamics(dynamics_kwargs) elif experiment_type == "kin": if assumption_mRNA == "ss": estimator = SSKineticsDynamics(dynamics_kwargs) elif assumption_mRNA == "kinetic": - estimator = KineticsDynamics(dynamics_kwargs) + if model == 'deterministic': + estimator = KineticsDynamics(dynamics_kwargs) + elif model == 'stochastic': + estimator = KineticsStormDynamics(dynamics_kwargs) else: raise NotImplementedError("This method has not been implemented.") elif experiment_type == "deg": @@ -1344,26 +1436,26 @@ def dynamics_wrapper( # incorporate the model selection code soon def dynamics( - adata: AnnData, - filter_gene_mode: Literal["final", "basic", "no"] = "final", - use_smoothed: bool = True, - assumption_mRNA: Literal["ss", "kinetic", "auto"] = "auto", - assumption_protein: Literal["ss"] = "ss", - model: Literal["auto", "deterministic", "stochastic"] = "auto", - est_method: Literal["ols", "rlm", "ransac", "gmm", "negbin", "auto", "twostep", "direct"] = "auto", - NTR_vel: bool = False, - group: Optional[str] = None, - protein_names: Optional[List[str]] = None, - concat_data: bool = False, - log_unnormalized: bool = True, - one_shot_method: Literal["combined", "sci-fate", "sci_fate"] = "combined", - fraction_for_deg: bool = False, - re_smooth: bool = False, - sanity_check: bool = False, - del_2nd_moments: Optional[bool] = None, - cores: int = 1, - tkey: str = None, - **est_kwargs, + adata: AnnData, + filter_gene_mode: Literal["final", "basic", "no"] = "final", + use_smoothed: bool = True, + assumption_mRNA: Literal["ss", "kinetic", "auto"] = "auto", + assumption_protein: Literal["ss"] = "ss", + model: Literal["auto", "deterministic", "stochastic"] = "auto", + est_method: Literal["ols", "rlm", "ransac", "gmm", "negbin", "auto", "twostep", "direct"] = "auto", + NTR_vel: bool = False, + group: Optional[str] = None, + protein_names: Optional[List[str]] = None, + concat_data: bool = False, + 
log_unnormalized: bool = True, + one_shot_method: Literal["combined", "sci-fate", "sci_fate"] = "combined", + fraction_for_deg: bool = False, + re_smooth: bool = False, + sanity_check: bool = False, + del_2nd_moments: Optional[bool] = None, + cores: int = 1, + tkey: str = None, + **est_kwargs, ) -> AnnData: """Inclusive model of expression dynamics considers splicing, metabolic labeling and protein translation. @@ -1706,8 +1798,8 @@ def dynamics( valid_gene_checker = np.zeros(gene_num, dtype=bool) for L_iter, cur_L in tqdm( - enumerate(L), - desc=f"sanity check of {experiment_type} experiment data:", + enumerate(L), + desc=f"sanity check of {experiment_type} experiment data:", ): cur_L = cur_L.A.flatten() if issparse(cur_L) else cur_L.flatten() y = strat_mom(cur_L, t, np.nanmean) @@ -2265,18 +2357,18 @@ def dynamics( def kinetic_model( - subset_adata: AnnData, - tkey: str, - model: Literal["auto", "deterministic", "stochastic"], - est_method: Literal["twostep", "direct"], - experiment_type: str, - has_splicing: bool, - splicing_labeling: bool, - has_switch: bool, - param_rngs: Dict[str, List[int]], - data_type: Literal["smoothed", "sfs"] = "sfs", - return_ntr: bool = False, - **est_kwargs, + subset_adata: AnnData, + tkey: str, + model: Literal["auto", "deterministic", "stochastic"], + est_method: Literal["twostep", "direct", "storm-csp", "storm-cszip", "storm-icsp"], + experiment_type: str, + has_splicing: bool, + splicing_labeling: bool, + has_switch: bool, + param_rngs: Dict[str, List[int]], + data_type: Literal["smoothed", "sfs"] = "sfs", + return_ntr: bool = False, + **est_kwargs, ) -> Tuple[ Union[Dict[str, Any], pd.DataFrame], np.ndarray, @@ -2447,6 +2539,152 @@ def kinetic_model( K_fit, ) + return ( + Estm_df, + half_life, + cost, + logLL, + _param_ranges, + X_data, + X_fit_data, + ) + elif "storm" in est_method: + if has_splicing: + # Initialization based on the steady-state assumption + layers_smoothed = ["M_u", "M_s", "M_t", "M_n"] + U_smoothed, S_smoothed, Total_smoothed, New_smoothed = ( + subset_adata.layers[layers_smoothed[0]].T, + subset_adata.layers[layers_smoothed[1]].T, + subset_adata.layers[layers_smoothed[2]].T, + subset_adata.layers[layers_smoothed[3]].T, + ) + + US_smoothed, S2_smoothed = ( + subset_adata.layers["M_us"].T, + subset_adata.layers["M_ss"].T, + ) + (gamma_k, _, _, _,) = fit_slope_stochastic(S_smoothed, U_smoothed, US_smoothed, S2_smoothed, + perc_left=None, perc_right=5) + (gamma_init, _, _, _, _) = lin_reg_gamma_synthesis(Total_smoothed, New_smoothed, time, perc_right=5) + beta_init = gamma_init / gamma_k # gamma_k = gamma / beta + + # Read raw counts + layers_raw = ["ul", "sl"] + UL_raw, SL_raw = ( + subset_adata.layers[layers_raw[0]].T, + subset_adata.layers[layers_raw[1]].T, + ) + + # Read smoothed values based CSP type distribution for cell-specific parameter inference + UL_smoothed_CSP, SL_smoothed_CSP = ( + subset_adata.layers['M_CSP_ul'].T, + subset_adata.layers['M_CSP_sl'].T, + ) + + # Parameters inference based on maximum likelihood estimation + cell_total = subset_adata.obs['initial_cell_size'].astype("float").values + # Independent cell-specific Poisson + (gamma_s, gamma_r2, beta, gamma_t, gamma_r2_raw, alpha) = storm.mle_independent_cell_specific_poisson \ + (UL_raw, SL_raw, time, gamma_init, beta_init, cell_total, Total_smoothed, S_smoothed) + gamma_k = gamma_s / beta + gamma_b = np.zeros_like(gamma_k) + + # Cell specific parameters (fixed gamma_s) + alpha, beta = storm.cell_specific_alpha_beta(UL_smoothed_CSP, SL_smoothed_CSP, time, 
gamma_s, beta) + + # # Cell specific parameters(fixed gamma_t) + # k = 1 - np.exp(-gamma_t[:, None] * time[None, :]) + # alpha = csr_matrix(gamma_t[:, None]).multiply(UL_smoothed_CSP+SL_smoothed_CSP).multiply(1 / k) + + Estm_df = { + "alpha": alpha, + "beta": beta, + "gamma_k": gamma_k, + "gamma_b": gamma_b, + # "gamma_k_r2": gamma_all_r2, + # "gamma_logLL": gamma_all_logLL, + "gamma": gamma_s, + "gamma_r2": gamma_r2, + # "mean_R2": mean_R2, + "gamma_t": gamma_t, + "gamma_r2_raw": gamma_r2_raw, + } + half_life = np.log(2) / gamma_s + cost, logLL, _param_ranges, X_data, X_fit_data = ( + None, + None, + None, + None, + None, + ) + + return ( + Estm_df, + half_life, + cost, + logLL, + _param_ranges, + X_data, + X_fit_data, + ) + else: + # Initialization based on the steady-state assumption + layers_smoothed = ["M_t", "M_n"] + Total_smoothed, New_smoothed = ( + subset_adata.layers[layers_smoothed[0]].T, + subset_adata.layers[layers_smoothed[1]].T, + ) + (gamma_init, _, _, _, _,) = lin_reg_gamma_synthesis(Total_smoothed, New_smoothed, time, + perc_right=5) + + # Read raw counts + layers_raw = ["total", "new"] + Total_raw, New_raw = ( + subset_adata.layers[layers_raw[0]].T, + subset_adata.layers[layers_raw[1]].T, + ) + + # Read smoothed values based CSP type distribution for cell-specific parameter inference + layers_smoothed_CSP = ["M_CSP_t", "M_CSP_n"] + Total_smoothed_CSP, New_smoothed_CSP = ( + subset_adata.layers[layers_smoothed_CSP[0]].T, + subset_adata.layers[layers_smoothed_CSP[1]].T, + ) + + # Parameters inference based on maximum likelihood estimation + cell_total = subset_adata.obs['initial_cell_size'].astype("float").values + + if "storm-csp" == est_method: + gamma, gamma_r2, gamma_r2_raw, alpha = storm.mle_cell_specific_poisson(New_raw, time, + gamma_init, cell_total) + elif "storm-cszip" == est_method: + gamma, prob_off, gamma_r2, gamma_r2_raw, alpha = storm.mle_cell_specific_zero_inflated_poisson( + New_raw, time, gamma_init, cell_total) + alpha = alpha * (1 - prob_off) # gene-wise alpha + else: + raise NotImplementedError("This method has not been implemented.") + + k = 1 - np.exp(-gamma[:, None] * time[None, :]) + alpha = csr_matrix(gamma[:, None]).multiply(New_smoothed_CSP).multiply(1 / k) # gene-cell-wise alpha + + Estm_df = { + "alpha": alpha, + "gamma": gamma, + "gamma_k": gamma, # required for phase_potrait + "gamma_r2": gamma_r2, + "gamma_r2_raw": gamma_r2_raw, + # "mean_R2": mean_R2, + "prob_off": prob_off if "cszip" in est_method else None + } + half_life = np.log(2) / gamma + cost, logLL, _param_ranges, X_data, X_fit_data = ( + None, + None, + None, + None, # X_data, + None, # K_fit, + ) + return ( Estm_df, half_life, @@ -2948,7 +3186,7 @@ def kinetic_model( X_data[i_gene] = cur_X_data if model.lower().startswith("mixture"): X_fit_data[i_gene] = estm.simulator.x.T - X_fit_data[i_gene][estm.model1.n_species :] *= estm.scale + X_fit_data[i_gene][estm.model1.n_species:] *= estm.scale elif experiment_type in ["mix_kin_deg", "mix_pulse_chase"]: # kinetic chase simulation kinetic_chase = estm.simulator.x.T From 8d131332a123704bab6f09df54b18bc6e3a57fb0 Mon Sep 17 00:00:00 2001 From: QiangweiPeng Date: Mon, 19 Jun 2023 16:51:07 +0800 Subject: [PATCH 17/31] Modify the relevant code to support cell-specific beta --- dynamo/estimation/csc/velocity.py | 2 ++ dynamo/tools/dynamics.py | 13 +++++++++++-- dynamo/tools/utils.py | 13 ++++++++++++- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/dynamo/estimation/csc/velocity.py b/dynamo/estimation/csc/velocity.py index 
8588836fa..6a6a0a52b 100755 --- a/dynamo/estimation/csc/velocity.py +++ b/dynamo/estimation/csc/velocity.py @@ -141,6 +141,8 @@ def vel_u(self, U, repeat=None, update_alpha=True): if self.parameters["beta"].ndim == 1: beta = np.repeat(self.parameters["beta"].reshape((-1, 1)), U.shape[1], axis=1) + elif self.parameters["beta"].shape[1] == U.shape[1]: # to support cell-wise beta + beta = self.parameters["beta"] elif self.parameters["beta"].shape[1] == len(t_uniq) and len(t_uniq) > 1: beta = np.zeros_like(U.shape) for i in range(len(t_uniq)): diff --git a/dynamo/tools/dynamics.py b/dynamo/tools/dynamics.py index 74591b4b6..f86104ab6 100755 --- a/dynamo/tools/dynamics.py +++ b/dynamo/tools/dynamics.py @@ -376,9 +376,11 @@ def _estimate_params_kin(self, cur_grp_i: int, cur_grp: str, subset_adata: AnnDa if type(params) == dict: self.alpha = params.pop("alpha") + self.beta = params.pop("beta") if "beta" in params else None params = pd.DataFrame(params) else: self.alpha = params.loc[:, "alpha"].values if "alpha" in params.columns else None + self.beta = params.loc[:, "beta"].values if "beta" in params.columns else None len_t, len_g = len(np.unique(self.t)), len(self._group) if cur_grp == self._group[0]: @@ -395,12 +397,19 @@ def _estimate_params_kin(self, cur_grp_i: int, cur_grp: str, subset_adata: AnnDa cur_X_fit_data, ) - self.a, self.b, self.alpha_a, self.alpha_i, self.beta, self.gamma = ( + # self.a, self.b, self.alpha_a, self.alpha_i, self.beta, self.gamma = ( + # params.loc[:, "a"].values if "a" in params.columns else None, + # params.loc[:, "b"].values if "b" in params.columns else None, + # params.loc[:, "alpha_a"].values if "alpha_a" in params.columns else None, + # params.loc[:, "alpha_i"].values if "alpha_i" in params.columns else None, + # params.loc[:, "beta"].values if "beta" in params.columns else None, + # params.loc[:, "gamma"].values if "gamma" in params.columns else None, + # ) + self.a, self.b, self.alpha_a, self.alpha_i, self.gamma = ( params.loc[:, "a"].values if "a" in params.columns else None, params.loc[:, "b"].values if "b" in params.columns else None, params.loc[:, "alpha_a"].values if "alpha_a" in params.columns else None, params.loc[:, "alpha_i"].values if "alpha_i" in params.columns else None, - params.loc[:, "beta"].values if "beta" in params.columns else None, params.loc[:, "gamma"].values if "gamma" in params.columns else None, ) if self.alpha is None: diff --git a/dynamo/tools/utils.py b/dynamo/tools/utils.py index a5b1e6707..fb3fdf603 100755 --- a/dynamo/tools/utils.py +++ b/dynamo/tools/utils.py @@ -1657,11 +1657,22 @@ def set_param_kinetic( adata.layers["cell_wise_alpha"][cur_cells_ind, valid_ind_] = alpha else: adata.var.loc[valid_ind, kin_param_pre + "alpha"] = alpha + + # to support cell-wise beta + if isarray(beta) and beta.ndim > 1: + adata.var.loc[valid_ind, kin_param_pre + "beta"] = beta.mean(1) + if cur_grp == _group[0]: + adata.layers["cell_wise_beta"] = sp.csr_matrix((adata.shape), dtype=np.float64) + beta = beta.T.tocsr() if sp.issparse(beta) else sp.csr_matrix(beta, dtype=np.float64).T + adata.layers["cell_wise_beta"][cur_cells_ind, valid_ind_] = beta + else: + adata.var.loc[valid_ind, kin_param_pre + "beta"] = beta + adata.var.loc[valid_ind, kin_param_pre + "a"] = a adata.var.loc[valid_ind, kin_param_pre + "b"] = b adata.var.loc[valid_ind, kin_param_pre + "alpha_a"] = alpha_a adata.var.loc[valid_ind, kin_param_pre + "alpha_i"] = alpha_i - adata.var.loc[valid_ind, kin_param_pre + "beta"] = beta + # adata.var.loc[valid_ind, kin_param_pre + "beta"] = 
beta adata.var.loc[valid_ind, kin_param_pre + "gamma"] = gamma adata.var.loc[valid_ind, kin_param_pre + "half_life"] = np.log(2) / gamma adata.var.loc[valid_ind, kin_param_pre + "cost"] = cost From eca797e4dfee9fecfb5776481c451d6fd66c7f53 Mon Sep 17 00:00:00 2001 From: QiangweiPeng Date: Mon, 19 Jun 2023 18:18:53 +0800 Subject: [PATCH 18/31] Add code to moments.py that calculates the moments needed for the storm method. --- dynamo/tools/moments.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/dynamo/tools/moments.py b/dynamo/tools/moments.py index 1cfb2ec54..d7d77cfad 100755 --- a/dynamo/tools/moments.py +++ b/dynamo/tools/moments.py @@ -173,6 +173,16 @@ def moments( ) layers = DynamoAdataKeyManager.get_available_layer_keys(adata, layers, False, False) + + # for CSP-type method + layers_raw = [ + layer + for layer in layers + if not (layer.startswith("X")) and not (layer.startswith("M")) and ( + not layer.endswith("matrix") and not layer.endswith("ambiguous")) + ] + layers_raw.sort(reverse=True) # ensure we get M_CSP_us, M_CSP_tn, etc (instead of M_CSP_su or M_CSP_nt). + layers = [ layer for layer in layers @@ -224,6 +234,30 @@ def moments( layer_x, layer_y, conn, normalize_W=normalize, mX=None, mY=None ) + # for CSP-type method + size_factor = adata.obs['Size_Factor'].astype("float").values + mapper_CSP = { + "new": "M_CSP_n", + "old": "M_CSP_o", + "total": "M_CSP_t", + "uu": "M_CSP_uu", + "ul": "M_CSP_ul", + "su": "M_CSP_su", + "sl": "M_CSP_sl", + "unspliced": "M_CSP_u", + "spliced": "M_CSP_s", + } + + # for CSP-type method + for i, layer in enumerate(layers_raw): + layer_x = adata.layers[layer].copy() + layer_x = inverse_norm(adata, layer_x) + + if mapper_CSP[layer] not in adata.layers.keys(): + local_size_factor = conn.dot(size_factor) + local_raw_counts = conn.dot(layer_x) + adata.layers[mapper_CSP[layer]] = csr_matrix(local_raw_counts/local_size_factor.reshape(-1,1)) + if "X_protein" in adata.obsm.keys(): # may need to update with mnn or just use knn from protein layer itself. 
adata.obsm[mapper["X_protein"]] = conn.dot(adata.obsm["X_protein"]) adata.obsp["moments_con"] = conn From e88738c3902e121ded56b2350ca61d29c24cc2e5 Mon Sep 17 00:00:00 2001 From: QiangweiPeng Date: Thu, 22 Jun 2023 00:26:13 +0800 Subject: [PATCH 19/31] Added the parameter inference method storm for one-shot experiments --- cell_cycle_CSPss.py | 124 +++++++++++ dynamo/estimation/csc/velocity.py | 357 +++++++++++++++++++----------- dynamo/tools/dynamics.py | 24 +- dynamo/tools/utils.py | 3 + scEU-seq_CellCycle_CSP_CSZIP.py | 240 ++++++++++++++++++++ scEU-seq_CellCycle_ICSP.py | 213 ++++++++++++++++++ 6 files changed, 822 insertions(+), 139 deletions(-) create mode 100644 cell_cycle_CSPss.py create mode 100644 scEU-seq_CellCycle_CSP_CSZIP.py create mode 100644 scEU-seq_CellCycle_ICSP.py diff --git a/cell_cycle_CSPss.py b/cell_cycle_CSPss.py new file mode 100644 index 000000000..8fa931025 --- /dev/null +++ b/cell_cycle_CSPss.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- +import warnings + +warnings.filterwarnings('ignore') +import dynamo as dyn + +filename = './data/rpe1.h5ad' + +rpe1 = dyn.read(filename) + +dyn.convert2float(rpe1, ['Cell_cycle_possition', 'Cell_cycle_relativePos']) + +rpe1.obs.exp_type.value_counts() + +rpe1[rpe1.obs.exp_type == 'Chase', :].obs.time.value_counts() + +rpe1[rpe1.obs.exp_type == 'Pulse', :].obs.time.value_counts() + +rpe1_kinetics = rpe1[rpe1.obs.exp_type == 'Pulse', :] +rpe1_kinetics.obs['time'] = rpe1_kinetics.obs['time'].astype(str) +rpe1_kinetics.obs.loc[rpe1_kinetics.obs['time'] == 'dmso', 'time'] = -1 +rpe1_kinetics.obs['time'] = rpe1_kinetics.obs['time'].astype(float) +rpe1_kinetics = rpe1_kinetics[rpe1_kinetics.obs.time != -1, :] +# rpe1_kinetics = rpe1_kinetics[rpe1_kinetics.obs.time > 29, :] +# rpe1_kinetics = rpe1_kinetics[rpe1_kinetics.obs.time < 31, :] +rpe1_kinetics = rpe1_kinetics[rpe1_kinetics.obs.time < 16, :] + +rpe1_kinetics.layers['new'], rpe1_kinetics.layers['total'] = rpe1_kinetics.layers['ul'] + rpe1_kinetics.layers['sl'], \ + rpe1_kinetics.layers['su'] + rpe1_kinetics.layers['sl'] + \ + rpe1_kinetics.layers['uu'] + rpe1_kinetics.layers['ul'] + +del rpe1_kinetics.layers['uu'], rpe1_kinetics.layers['ul'], rpe1_kinetics.layers['su'], rpe1_kinetics.layers['sl'] + +print(rpe1_kinetics.obs.time) + +rpe1_kinetics.obs.time = rpe1_kinetics.obs.time.astype('float') +rpe1_kinetics.obs.time = rpe1_kinetics.obs.time / 60 # convert minutes to hours + +# dyn.pp.recipe_monocle( +# rpe1_kinetics, +# tkey="time", +# experiment_type="one-shot", +# # experiment_type="kin", +# n_top_genes=1000, +# total_layers=False, +# keep_raw_layers=True, +# # feature_selection_layer="new", +# ) +# dyn.tl.dynamics(rpe1_kinetics, +# model="deterministic", +# # est_method='CSP4ML_CSPss' +# ) + +from dynamo.tools.dynamics import dynamics_wrapper +from dynamo.tools.dimension_reduction import reduceDimension +from dynamo.tools.cell_velocities import cell_velocities +from dynamo.preprocessing.utils import ( + del_raw_layers, + detect_experiment_datatype, + reset_adata_X, +) +from dynamo.preprocessing import Preprocessor + +keep_filtered_cells = False +keep_filtered_genes = True +keep_raw_layers = True +del_2nd_moments = True +has_splicing, has_labeling, splicing_labeling = False, True, False +if has_splicing and has_labeling and splicing_labeling: + layers = ["X_new", "X_total", "X_uu", "X_ul", "X_su", "X_sl"] +elif has_labeling: + layers = ["X_new", "X_total"] + +# Preprocessing +preprocessor = Preprocessor(cell_cycle_score_enable=True) 
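+# Keep raw layers through preprocessing: with one_shot_method='storm-csp', the dynamics step below reads the
+# raw 'new'/'total' counts and the CSP-smoothed 'M_CSP_n' layer in addition to the normalized/smoothed layers.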
+preprocessor.config_monocle_recipe(rpe1_kinetics, n_top_genes=1000) +preprocessor.size_factor_kwargs.update( + { + "X_total_layers": False, + "splicing_total_layers": False, + } +) +preprocessor.normalize_by_cells_function_kwargs.update( + { + "X_total_layers": False, + "splicing_total_layers": False, + "keep_filtered": keep_filtered_genes, + "total_szfactor": "total_Size_Factor", + } +) +preprocessor.filter_cells_by_outliers_kwargs["keep_filtered"] = keep_filtered_cells +preprocessor.select_genes_kwargs["keep_filtered"] = keep_filtered_genes +if True: + reset_adata_X(rpe1_kinetics, experiment_type="one-shot", has_labeling=has_labeling, has_splicing=has_splicing) +preprocessor.preprocess_adata_monocle(adata=rpe1_kinetics, tkey='time', experiment_type="one-shot") +if not keep_raw_layers: + del_raw_layers(rpe1_kinetics) + +from dynamo.tools.dynamics import dynamics_wrapper +dynamics_wrapper( + rpe1_kinetics, + model="stochastic", + del_2nd_moments=del_2nd_moments, + assumption_mRNA='ss', + one_shot_method='storm-csp', +) +reduceDimension(rpe1_kinetics, reduction_method='umap') +cell_velocities(rpe1_kinetics, enforce=True, vkey='velocity_T', ekey='M_t', basis='umap') + + +rpe1_kinetics.obsm['X_RFP_GFP'] = rpe1_kinetics.obs.loc[:, + ['RFP_log10_corrected', 'GFP_log10_corrected']].values.astype('float') + +dyn.tl.reduceDimension(rpe1_kinetics, reduction_method='umap') +dyn.tl.cell_velocities(rpe1_kinetics, enforce=True, vkey='velocity_T', ekey='M_t', basis='RFP_GFP') +dyn.pl.streamline_plot(rpe1_kinetics, color=['cell_cycle_phase'], basis='RFP_GFP', save_show_or_return='show') + + +# path = './figures_new/figure4/' +# figsize = (6, 4) +# dyn.pl.streamline_plot(rpe1_kinetics, color=['cell_cycle_phase'], basis='RFP_GFP', save_show_or_return='show', +# save_kwargs={'prefix': 'cell_cycle_rfp_gfp_15mins_dynamo', 'ext': 'png', +# "bbox_inches": None, 'dpi': 600, 'path': path}, figsize=figsize) diff --git a/dynamo/estimation/csc/velocity.py b/dynamo/estimation/csc/velocity.py index 6a6a0a52b..35c79c702 100755 --- a/dynamo/estimation/csc/velocity.py +++ b/dynamo/estimation/csc/velocity.py @@ -2,6 +2,7 @@ from multiprocessing.dummy import Pool as ThreadPool from warnings import warn +import numpy as np from scipy.sparse import csr_matrix from tqdm import tqdm @@ -17,6 +18,7 @@ ) from .utils_velocity import * + # from sklearn.cluster import KMeans # from sklearn.neighbors import NearestNeighbors @@ -43,14 +45,14 @@ class Velocity: """ def __init__( - self, - alpha=None, - beta=None, - gamma=None, - eta=None, - delta=None, - t=None, - estimation=None, + self, + alpha=None, + beta=None, + gamma=None, + eta=None, + delta=None, + t=None, + estimation=None, ): if estimation is not None: self.parameters = {} @@ -398,25 +400,28 @@ class ss_estimation: """ def __init__( - self, - U=None, - Ul=None, - S=None, - Sl=None, - P=None, - US=None, - S2=None, - conn=None, - t=None, - ind_for_proteins=None, - model="stochastic", - est_method="gmm", - experiment_type="deg", - assumption_mRNA=None, - assumption_protein="ss", - concat_data=True, - cores=1, - **kwargs + self, + U=None, + Ul=None, + S=None, + Sl=None, + P=None, + US=None, + S2=None, + NewCounts=None, + TotalCounts=None, + NewSmoothCSP=None, + conn=None, + t=None, + ind_for_proteins=None, + model="stochastic", + est_method="gmm", + experiment_type="deg", + assumption_mRNA=None, + assumption_protein="ss", + concat_data=True, + cores=1, + **kwargs ): self.t = t @@ -428,6 +433,9 @@ def __init__( "p": P, "us": US, "s2": S2, + "new_counts": NewCounts, + 
"total_counts": TotalCounts, + "new_smooth_csp": NewSmoothCSP, } # U: (unlabeled) unspliced; S: (unlabeled) spliced; U / Ul: old and labeled; U, Ul, S, Sl: uu/ul/su/sl if concat_data: self.concatenate_data() @@ -470,12 +478,12 @@ def __init__( self.ind_for_proteins = ind_for_proteins def fit( - self, - intercept=False, - perc_left=None, - perc_right=5, - clusters=None, - one_shot_method="combined", + self, + intercept=False, + perc_left=None, + perc_right=5, + clusters=None, + one_shot_method="combined", ): """Fit the input data to estimate all or a subset of the parameters @@ -973,7 +981,7 @@ def fit( np.zeros(n_genes), ) for i in range( - n_genes + n_genes ): # can also use the two extreme time points and apply sci-fate like approach. S, U = ( self.data["su"][i] + self.data["sl"][i], @@ -1187,8 +1195,8 @@ def fit( ) if cores == 1: for i in tqdm( - range(n_genes), - desc="estimating beta and alpha for one-shot experiment", + range(n_genes), + desc="estimating beta and alpha for one-shot experiment", ): ( k[i], @@ -1265,8 +1273,8 @@ def fit( ) if cores == 1: for i in tqdm( - range(n_genes), - desc="estimating gamma and alpha for one-shot experiment", + range(n_genes), + desc="estimating gamma and alpha for one-shot experiment", ): ( k[i], @@ -1335,94 +1343,135 @@ def fit( bf, ) elif np.all(self._exist_data("uu", "ul")): - k, k_intercept, k_r2, k_logLL, bs, bf = ( - np.zeros(n_genes), - np.zeros(n_genes), - np.zeros(n_genes), - np.zeros(n_genes), - np.zeros(n_genes), - np.zeros(n_genes), - ) - U = self.data["ul"] - S = self.data["ul"] + self.data["uu"] - US = ( - self.data["us"] - if self.data["us"] is not None - else calc_2nd_moment(U.T, S.T, self.conn, mX=U.T, mY=S.T).T - ) - S2 = ( - self.data["s2"] - if self.data["s2"] is not None - else calc_2nd_moment(S.T, S.T, self.conn, mX=S.T, mY=S.T).T - ) - if cores == 1: - for i in tqdm(range(n_genes), desc="estimating gamma"): + if one_shot_method == "storm-csp": + gamma, gamma_r2, k = ( + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + ) + new_counts = self.data["new_counts"] + total_counts = self.data["total_counts"] + new_smooth_csp = self.data["new_smooth_csp"] + new_smooth = self.data['ul'] + total_smooth = self.data["ul"] + self.data["uu"] + for i in tqdm(range(n_genes), desc="estimating gamma via storm's csp model"): ( + gamma[i], + gamma_r2[i], k[i], - k_intercept[i], - _, - k_r2[i], - _, - k_logLL[i], - bs[i], - bf[i], - ) = self.fit_gamma_stochastic( - self.est_method, - U[i], - S[i], - US[i], - S2[i], + ) = self.fit_gamma_storm_csp( + new_counts[i], + total_counts[i], + new_smooth[i], + total_smooth[i], + t_uniq=t_uniq, perc_left=perc_left, perc_right=perc_right, normalize=True, ) + _, alpha = one_shot_gamma_alpha_matrix(k, t_uniq, new_smooth_csp) + ( + self.parameters["alpha"], + self.parameters["gamma"], + self.aux_param["gamma_k"], + self.aux_param["gamma_intercept"], + self.aux_param["gamma_r2"], + ) = ( + alpha, + gamma, + k, + np.zeros(n_genes), + gamma_r2, + ) else: - pool = ThreadPool(cores) - res = pool.starmap( - self.fit_gamma_stochastic, - zip( - itertools.repeat(self.est_method), - U, - S, - US, - S2, - itertools.repeat(perc_left), - itertools.repeat(perc_right), - itertools.repeat(True), - ), + k, k_intercept, k_r2, k_logLL, bs, bf = ( + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), ) - pool.close() - pool.join() - (k, k_intercept, _, k_r2, _, k_logLL, bs, bf) = zip(*res) - (k, k_intercept, k_r2, k_logLL, bs, bf) = ( - 
np.array(k), - np.array(k_intercept), - np.array(k_r2), - np.array(k_logLL), - np.array(bs), - np.array(bf), + U = self.data["ul"] + S = self.data["ul"] + self.data["uu"] + US = ( + self.data["us"] + if self.data["us"] is not None + else calc_2nd_moment(U.T, S.T, self.conn, mX=U.T, mY=S.T).T + ) + S2 = ( + self.data["s2"] + if self.data["s2"] is not None + else calc_2nd_moment(S.T, S.T, self.conn, mX=S.T, mY=S.T).T ) + if cores == 1: + for i in tqdm(range(n_genes), desc="estimating gamma"): + ( + k[i], + k_intercept[i], + _, + k_r2[i], + _, + k_logLL[i], + bs[i], + bf[i], + ) = self.fit_gamma_stochastic( + self.est_method, + U[i], + S[i], + US[i], + S2[i], + perc_left=perc_left, + perc_right=perc_right, + normalize=True, + ) + else: + pool = ThreadPool(cores) + res = pool.starmap( + self.fit_gamma_stochastic, + zip( + itertools.repeat(self.est_method), + U, + S, + US, + S2, + itertools.repeat(perc_left), + itertools.repeat(perc_right), + itertools.repeat(True), + ), + ) + pool.close() + pool.join() + (k, k_intercept, _, k_r2, _, k_logLL, bs, bf) = zip(*res) + (k, k_intercept, k_r2, k_logLL, bs, bf) = ( + np.array(k), + np.array(k_intercept), + np.array(k_r2), + np.array(k_logLL), + np.array(bs), + np.array(bf), + ) - gamma, alpha = one_shot_gamma_alpha_matrix(k, t_uniq, U) - ( - self.parameters["alpha"], - self.parameters["gamma"], - self.aux_param["gamma_k"], - self.aux_param["gamma_intercept"], - self.aux_param["gamma_r2"], - self.aux_param["gamma_logLL"], - self.aux_param["bs"], - self.aux_param["bf"], - ) = ( - alpha, - gamma, - k, - k_intercept, - k_r2, - k_logLL, - bs, - bf, - ) + gamma, alpha = one_shot_gamma_alpha_matrix(k, t_uniq, U) + ( + self.parameters["alpha"], + self.parameters["gamma"], + self.aux_param["gamma_k"], + self.aux_param["gamma_intercept"], + self.aux_param["gamma_r2"], + self.aux_param["gamma_logLL"], + self.aux_param["bs"], + self.aux_param["bf"], + ) = ( + alpha, + gamma, + k, + k_intercept, + k_r2, + k_logLL, + bs, + bf, + ) elif self.extyp.lower() == "mix_std_stm": t_min, t_max = np.min(self.t), np.max(self.t) if np.all(self._exist_data("ul", "uu", "su")): @@ -1433,13 +1482,13 @@ def fit( np.zeros(n_genes), ) for i in tqdm( - range(n_genes), desc="solving gamma/beta" + range(n_genes), desc="solving gamma/beta" ): # can also use the two extreme time points and apply sci-fate like approach. 
tmp = ( - self.data["uu"][i, self.t == t_max] - + self.data["ul"][i, self.t == t_max] - + self.data["su"][i, self.t == t_max] - + self.data["sl"][i, self.t == t_max] + self.data["uu"][i, self.t == t_max] + + self.data["ul"][i, self.t == t_max] + + self.data["su"][i, self.t == t_max] + + self.data["sl"][i, self.t == t_max] ) total[i] = np.mean(tmp) gamma[i] = solve_gamma( @@ -1470,7 +1519,7 @@ def fit( n_genes = self.data["uu"].shape[0] # self.get_n_genes(data=U) gamma, U = np.zeros(n_genes), np.zeros(n_genes) for i in tqdm( - range(n_genes), desc="solving gamma, alpha" + range(n_genes), desc="solving gamma, alpha" ): # apply sci-fate like approach (can also use one-single time point to estimate gamma) # tmp = self.data['uu'][i, self.t == 0] + self.data['ul'][i, self.t == 0] tmp_ = self.data["uu"][i, self.t == t_max] + self.data["ul"][i, self.t == t_max] @@ -1614,16 +1663,58 @@ def fit_gamma_steady_state(self, u, s, intercept=True, perc_left=None, perc_righ return k, b, r2, all_r2, logLL, all_logLL + def fit_gamma_storm_csp( + self, + new_counts, + total_counts, + new_smooth, + total_smooth, + t_uniq, + perc_left=None, + perc_right=50, + normalize=True, + ): + """Estimate gamma using Storm's CSP model based on the steady state assumption. + + Arguments + --------- + new_counts: :class:`~numpy.ndarray` or sparse `csr_matrix` + A matrix of new mRNA raw counts. Dimension: genes x cells. + total_counts: :class:`~numpy.ndarray` or sparse `csr_matrix` + A matrix of total mRNA raw counts. Dimension: genes x cells. + new_smooth: :class:`~numpy.ndarray` or sparse `csr_matrix` + A matrix of new mRNA smoothed data. Dimension: genes x cells. + total_smooth: :class:`~numpy.ndarray` or sparse `csr_matrix` + A matrix of total mRNA smoothed data. Dimension: genes x cells. + t_uniq: : float + The labeling duration of one-shot experiment. + perc_left: float + The percentage of samples included in the linear regression in the left tail. If set to None, then all the left samples are excluded. + perc_right: float + The percentage of samples included in the linear regression in the right tail. If set to None, then all the samples are included. + normalize: bool + Whether to first normalize the + """ + new_counts = new_counts.A.flatten() if issparse(new_counts) else new_counts.flatten() + total_counts = total_counts.A.flatten() if issparse(total_counts) else total_counts.flatten() + new_smooth = new_smooth.A.flatten() if issparse(new_smooth) else new_smooth.flatten() + total_smooth = total_smooth.A.flatten() if issparse(total_smooth) else total_smooth.flatten() + mask = find_extreme(new_smooth, total_smooth, perc_left=perc_left, perc_right=perc_right, normalize=normalize) + gamma = - np.log(1 - np.mean(new_counts[mask]) / np.mean(total_counts[mask])) / t_uniq + gamma_r2 = 1.0 + k = 1 - np.exp(-gamma*t_uniq) + return gamma, gamma_r2, k + def fit_gamma_stochastic( - self, - est_method, - u, - s, - us, - ss, - perc_left=None, - perc_right=5, - normalize=True, + self, + est_method, + u, + s, + us, + ss, + perc_left=None, + perc_right=5, + normalize=True, ): """Estimate gamma using GMM (generalized method of moments) or negbin distrubtion based on the steady state assumption. 
@@ -1805,8 +1896,8 @@ def solve_alpha_mix_std_stm(self, t, ul, beta, clusters=None, alpha_time_depende alpha_std, alpha_stm = alpha_std_ini, np.zeros((ul.shape[0], len(t_uniq))) alpha_stm[:, 0] = alpha_std_ini # 0 stimulation point is the steady state transcription for i in tqdm( - range(ul.shape[0]), - desc="solving steady state alpha and induction alpha", + range(ul.shape[0]), + desc="solving steady state alpha and induction alpha", ): l = ul[i].A.flatten() if issparse(ul) else ul[i] for t_ind in np.arange(1, len(t_uniq)): diff --git a/dynamo/tools/dynamics.py b/dynamo/tools/dynamics.py index f86104ab6..ed25c0e7f 100755 --- a/dynamo/tools/dynamics.py +++ b/dynamo/tools/dynamics.py @@ -316,6 +316,16 @@ def _estimate_params_ss(self, subset_adata: AnnData, **est_params_args): else: ss_estimation_kwargs = {} + if self.one_shot_method == "storm-csp": + _, valid_bools, _ = self._filter() + self.NewCounts = self.adata[:, valid_bools].layers['new'].T + self.TotalCounts = self.adata[:, valid_bools].layers['total'].T + self.NewSmoothCSP = self.adata[:, valid_bools].layers['M_CSP_n'].T + else: + self.NewCounts = None + self.TotalCounts = None + self.NewSmoothCSP = None + self.est = ss_estimation( U=self.U.copy() if self.U is not None else None, Ul=self.Ul.copy() if self.Ul is not None else None, @@ -324,6 +334,9 @@ def _estimate_params_ss(self, subset_adata: AnnData, **est_params_args): P=self.P.copy() if self.P is not None else None, US=self.US.copy() if self.US is not None else None, S2=self.S2.copy() if self.S2 is not None else None, + NewCounts=self.NewCounts.copy() if self.NewCounts is not None else None, + TotalCounts=self.TotalCounts.copy() if self.TotalCounts is not None else None, + NewSmoothCSP=self.NewSmoothCSP.copy() if self.NewSmoothCSP is not None else None, conn=subset_adata.obsp["moments_con"], t=self.t, ind_for_proteins=self.ind_for_proteins, @@ -341,7 +354,10 @@ def _estimate_params_ss(self, subset_adata: AnnData, **est_params_args): warnings.simplefilter("ignore") if self.experiment_type.lower() in ["one-shot", "one_shot"]: - self.est.fit(one_shot_method=self.one_shot_method, **self.est_kwargs) + if self.one_shot_method == "storm-csp": + self.est.fit(one_shot_method=self.one_shot_method, perc_right=50, **self.est_kwargs) + else: + self.est.fit(one_shot_method=self.one_shot_method, **self.est_kwargs) else: # experiment_type can be `kin` also and by default use # conventional method to estimate k but correct for time @@ -1417,11 +1433,7 @@ def dynamics_wrapper( if experiment_type == "conventional": estimator = SplicedDynamics(dynamics_kwargs) elif experiment_type in ["one-shot", "one_shot"]: - if model == 'deterministic': - estimator = OneShotDynamics(dynamics_kwargs) - elif model == 'stochastic': - dynamics_kwargs['est_method'] = 'storm-csp' - estimator = OneShotDynamics(dynamics_kwargs) + estimator = OneShotDynamics(dynamics_kwargs) elif experiment_type == "kin": if assumption_mRNA == "ss": estimator = SSKineticsDynamics(dynamics_kwargs) diff --git a/dynamo/tools/utils.py b/dynamo/tools/utils.py index fb3fdf603..90f5c699b 100755 --- a/dynamo/tools/utils.py +++ b/dynamo/tools/utils.py @@ -2269,6 +2269,9 @@ def set_transition_genes( # min_r2 = 0.5 if min_r2 is None else min_r2 # else: min_r2 = 0.9 if min_r2 is None else min_r2 + elif "storm" in adata.uns["dynamics"]["est_method"] and adata.uns["dynamics"]["experiment_type"] == "kin": + # for storm method + min_r2 = 0.9 if min_r2 is None else min_r2 elif adata.uns["dynamics"]["experiment_type"] in [ "mix_kin_deg", "mix_pulse_chase", 
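For reference, the steady-state estimate added in fit_gamma_storm_csp above can be summarized with a small,
self-contained numpy sketch. This is not the dynamo API: the helper name and the percentile-based cell
selection are illustrative simplifications of find_extreme(..., perc_right=50), and the real method applies
the mask derived from smoothed values to the raw counts.

import numpy as np

def gamma_one_shot_sketch(new_counts, total_counts, t_label, perc_right=50):
    """Hypothetical helper: one-shot gamma from the mean labeling fraction of high-expression cells."""
    new_counts = np.asarray(new_counts, dtype=float).ravel()
    total_counts = np.asarray(total_counts, dtype=float).ravel()
    # keep the top `perc_right` percent of cells by total expression (stand-in for find_extreme)
    mask = total_counts >= np.percentile(total_counts, 100 - perc_right)
    # under steady state, new/total = 1 - exp(-gamma * t) for labeling duration t
    k = np.mean(new_counts[mask]) / np.mean(total_counts[mask])
    gamma = -np.log(1.0 - k) / t_label
    return gamma, k

In the patch itself, ss_estimation.fit then derives cell-wise alpha from the CSP-smoothed layer via
one_shot_gamma_alpha_matrix(k, t_uniq, new_smooth_csp), which this sketch omits.
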
diff --git a/scEU-seq_CellCycle_CSP_CSZIP.py b/scEU-seq_CellCycle_CSP_CSZIP.py new file mode 100644 index 000000000..f1a9ac691 --- /dev/null +++ b/scEU-seq_CellCycle_CSP_CSZIP.py @@ -0,0 +1,240 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- + +import warnings + +warnings.filterwarnings('ignore') +import dynamo as dyn + +filename = './data/rpe1.h5ad' + +rpe1 = dyn.read(filename) + +dyn.convert2float(rpe1, ['Cell_cycle_possition', 'Cell_cycle_relativePos']) + +rpe1.obs.exp_type.value_counts() + +rpe1[rpe1.obs.exp_type == 'Chase', :].obs.time.value_counts() + +rpe1[rpe1.obs.exp_type == 'Pulse', :].obs.time.value_counts() + +rpe1_kinetics = rpe1[rpe1.obs.exp_type == 'Pulse', :] +rpe1_kinetics.obs['time'] = rpe1_kinetics.obs['time'].astype(str) +rpe1_kinetics.obs.loc[rpe1_kinetics.obs['time'] == 'dmso', 'time'] = -1 +rpe1_kinetics.obs['time'] = rpe1_kinetics.obs['time'].astype(float) +rpe1_kinetics = rpe1_kinetics[rpe1_kinetics.obs.time != -1, :] + +rpe1_kinetics.layers['new'], rpe1_kinetics.layers['total'] = rpe1_kinetics.layers['ul'] + rpe1_kinetics.layers['sl'], \ + rpe1_kinetics.layers['su'] + rpe1_kinetics.layers['sl'] + \ + rpe1_kinetics.layers['uu'] + rpe1_kinetics.layers['ul'] + +del rpe1_kinetics.layers['uu'], rpe1_kinetics.layers['ul'], rpe1_kinetics.layers['su'], rpe1_kinetics.layers['sl'] + +print(rpe1_kinetics.obs.time) + +rpe1_kinetics.obs.time = rpe1_kinetics.obs.time.astype('float') +rpe1_kinetics.obs.time = rpe1_kinetics.obs.time / 60 # convert minutes to hours + +# # velocity +# dyn.tl.recipe_kin_data(adata=rpe1_kinetics, +# keep_filtered_genes=True, +# keep_raw_layers=True, +# del_2nd_moments=False, +# tkey='time', +# n_top_genes=1000, +# ) + +from dynamo.tools.dynamics import dynamics_wrapper +from dynamo.tools.dimension_reduction import reduceDimension +from dynamo.tools.cell_velocities import cell_velocities +from dynamo.preprocessing.utils import ( + del_raw_layers, + detect_experiment_datatype, + reset_adata_X, +) +from dynamo.preprocessing import Preprocessor + +keep_filtered_cells = False +keep_filtered_genes = True +keep_raw_layers = True +del_2nd_moments = True +has_splicing, has_labeling, splicing_labeling = False, True, False +if has_splicing and has_labeling and splicing_labeling: + layers = ["X_new", "X_total", "X_uu", "X_ul", "X_su", "X_sl"] +elif has_labeling: + layers = ["X_new", "X_total"] + +# Preprocessing +preprocessor = Preprocessor(cell_cycle_score_enable=True) +preprocessor.config_monocle_recipe(rpe1_kinetics, n_top_genes=1000) +preprocessor.size_factor_kwargs.update( + { + "X_total_layers": False, + "splicing_total_layers": False, + } +) +preprocessor.normalize_by_cells_function_kwargs.update( + { + "X_total_layers": False, + "splicing_total_layers": False, + "keep_filtered": keep_filtered_genes, + "total_szfactor": "total_Size_Factor", + } +) +preprocessor.filter_cells_by_outliers_kwargs["keep_filtered"] = keep_filtered_cells +preprocessor.select_genes_kwargs["keep_filtered"] = keep_filtered_genes +if True: + reset_adata_X(rpe1_kinetics, experiment_type="kin", has_labeling=has_labeling, has_splicing=has_splicing) +preprocessor.preprocess_adata_monocle(adata=rpe1_kinetics, tkey='time', experiment_type="kin") +if not keep_raw_layers: + del_raw_layers(rpe1_kinetics) + +dynamics_wrapper(rpe1_kinetics, model="stochastic", est_method="storm-cszip", del_2nd_moments=del_2nd_moments) +reduceDimension(rpe1_kinetics, reduction_method='umap') +cell_velocities(rpe1_kinetics, basis='umap') + + +# dyn.tl.gene_wise_confidence(adata=rpe1_kinetics, +# 
group='cell_cycle_phase', +# lineage_dict={'M': 'G2-M'}, +# ekey='M_t', +# vkey='velocity_T' +# ) + +rpe1_kinetics.obsm['X_RFP_GFP'] = rpe1_kinetics.obs.loc[:, + ['RFP_log10_corrected', 'GFP_log10_corrected']].values.astype('float') + +dyn.tl.reduceDimension(rpe1_kinetics, reduction_method='umap') +dyn.tl.cell_velocities(rpe1_kinetics, enforce=True, vkey='velocity_T', ekey='M_t', basis='RFP_GFP') +dyn.pl.streamline_plot(rpe1_kinetics, color=['cell_cycle_phase'], basis='RFP_GFP') + +# # # for velocity gene-wise parameters +# # import matplotlib.pyplot as plt +# # import scanpy as sc +# # sc.set_figure_params(scanpy=True, fontsize=6) +# # plt.rcParams['font.size'] = '6' +# # dyn.configuration.set_figure_params(dpi_save=600, figsize=(17 / 3 / 2.54, 17 / 3 / 2.54 * (4 / 6))) +# # +# # save_path = './cell_wise_figures/' +# # dyn.pl.streamline_plot(rpe1_kinetics, color=['cell_cycle_phase'], vkey='velocity_T', ekey='M_t', basis='RFP_GFP', +# # save_show_or_return='show', +# # save_kwargs={"path": save_path, "prefix": 'cszip_stream_gene-wise_alpha', "dpi": 600, 'ext':'png'}) +# +# +# import scvelo as scv +# import matplotlib.pyplot as plt +# +# plt.rcParams['font.size'] = '7' +# dpi = 600 +# figsize = (6, 3) +# +# well_fitted = rpe1_kinetics.var['gamma_r2'] > 0 +# print(well_fitted) +# well_fitted_genes = well_fitted[well_fitted].index +# # well_fitted_genes = rpe1_kinetics.var['gamma_r2_raw'].sort_values(ascending=False).index[:1000] +# save_path = './cell_wise_figures/csp_alpha.png' +# # save_path = './cell_wise_figures/cszip_alpha_m_p_on.png' +# +# from dynamo.preprocessing.cell_cycle import get_cell_phase_genes +# +# cell_cycle_genes = get_cell_phase_genes(rpe1_kinetics, None) +# print(cell_cycle_genes) +# +# # yticklabels = [None]*len(well_fitted_genes) +# +# +# ax = scv.pl.heatmap(rpe1_kinetics, +# var_names=well_fitted_genes, +# sortby='Cell_cycle_relativePos', +# col_color='cell_cycle_phase', +# n_convolve=100, +# layer='cell_wise_alpha', +# figsize=(6, 3), +# show=False, +# colorbar=True, +# cbar_pos=(0.12, 0.4, 0.05, 0.18) +# # yticklabels=yticklabels +# ) +# # plt.colorbar() +# # plt.savefig(save_path, dpi=dpi, figsize=figsize) +# plt.show() +# +# # # genes = ['HMGA2', 'DCBLD2', 'HIPK2'] +# # dyn.configuration.set_figure_params(fontsize=6) +# # genes = ['HMGA2'] +# # dyn.pl.phase_portraits(rpe1_kinetics, genes=genes, color='cell_cycle_phase', basis='RFP_GFP', vkey='velocity_T', +# # ekey='M_t', show_arrowed_spines=False, show_quiver=True, quiver_size=5, figsize=(6*0.53, 4*0.53)) +# # genes = ['DCBLD2'] +# # dyn.pl.phase_portraits(rpe1_kinetics, genes=genes, color='cell_cycle_phase', basis='RFP_GFP', vkey='velocity_T', +# # ekey='M_t', show_arrowed_spines=False, show_quiver=True, quiver_size=5, figsize=(6*0.53, 4*0.53)) +# # genes = ['HIPK2'] +# # dyn.pl.phase_portraits(rpe1_kinetics, genes=genes, color='cell_cycle_phase', basis='RFP_GFP', vkey='velocity_T', +# # ekey='M_t', show_arrowed_spines=False, show_quiver=True, quiver_size=5, figsize=(6*0.53, 4*0.53)) +# +# +# # dyn.vf.VectorField(rpe1_kinetics, basis='RFP_GFP', map_topography=True, M=100) +# # import matplotlib.pyplot as plt +# # import numpy as np +# # +# # fig, ax = plt.subplots() +# # ax = dyn.pl.topography(rpe1_kinetics, basis='RFP_GFP', color='Cell_cycle_relativePos', ax=ax, +# # save_show_or_return='show', fps_basis='RFP_GFP') +# +# # # dyn.tl.cell_velocities(rpe1_kinetics, basis='pca') +# # # dyn.vf.VectorField(rpe1_kinetics, basis='pca', M=100) +# # # dyn.pp.top_pca_genes(rpe1_kinetics, n_top_genes=100) +# # # 
top_pca_genes = rpe1_kinetics.var.index[rpe1_kinetics.var.top_pca_genes] +# # # top_pca_genes = ["CDK4", "CDK6", "CDK2", "SKP2", "WEE1", "CDK1", "CDC20"] + list(top_pca_genes) +# # +# # dyn.tl.cell_velocities(rpe1_kinetics, basis='pca') +# # dyn.vf.VectorField(rpe1_kinetics, basis='pca', M=100) +# # # top_pca_genes = rpe1_kinetics[:, rpe1_kinetics.var['use_for_transition']].var.index.tolist() +# # # top_pca_genes = ["CDK4", "CDK6", "CDK2", "SKP2", "WEE1", "CDK1", "CDC20"] + list(top_pca_genes) +# # top_pca_genes = ["CDK4", "CDK6", "CDK2", "SKP2", "WEE1", "CDK1", "CDC20"] +# # +# # dyn.vf.jacobian(rpe1_kinetics, regulators=top_pca_genes, effectors=top_pca_genes) +# # dyn.pl.jacobian( +# # rpe1_kinetics, +# # regulators=top_pca_genes, +# # effectors=top_pca_genes, +# # basis="RFP_GFP", +# # ) +# # +# # divergence_rank = dyn.vf.rank_divergence_genes(rpe1_kinetics, groups='cell_cycle_phase') +# # dyn.vf.rank_jacobian_genes(rpe1_kinetics, groups='cell_cycle_phase') +# # +# # full_reg_rank = dyn.vf.rank_jacobian_genes(rpe1_kinetics, +# # groups='cell_cycle_phase', +# # mode="full_reg", +# # abs=True, +# # output_values=True, +# # return_df=True) +# # full_eff_rank = dyn.vf.rank_jacobian_genes(rpe1_kinetics, +# # groups='cell_cycle_phase', +# # mode='full_eff', +# # abs=True, +# # exclude_diagonal=True, +# # output_values=True, +# # return_df=True) +# # # print(full_reg_rank['G2-M']) +# # # print(full_reg_rank['G2-M'].head(2)) +# # +# # # unknown_cell_type_regulators = ["E2F", "Cdk4", "Cdk6", "pRB", "pRBp", "pRBpp", "Cdk2", +# # # "Skp2", "Wee1", "Cdh1", "Cdc25", "Cdk1", "Cdc20"] +# # unknown_cell_type_regulators = ["CDK4", "CDK6", "CDK2", "SKP2", "WEE1", "CDK1", "CDC20"] +# # +# # edges_list = dyn.vf.build_network_per_cluster(rpe1_kinetics, +# # cluster='cell_cycle_phase', +# # cluster_names=None, +# # full_reg_rank=full_reg_rank, +# # full_eff_rank=full_eff_rank, +# # genes=np.unique(unknown_cell_type_regulators), +# # n_top_genes=100) +# # +# # import networkx as nx +# # +# # print(edges_list) +# # network = nx.from_pandas_edgelist(edges_list['G1-S'], 'regulator', 'target', edge_attr='weight', +# # create_using=nx.DiGraph()) +# # ax = dyn.pl.arcPlot(rpe1_kinetics, cluster='cell_cycle_phase', cluster_name="G1-S", edges_list=None, +# # network=network, color="M_t", save_show_or_return='show') diff --git a/scEU-seq_CellCycle_ICSP.py b/scEU-seq_CellCycle_ICSP.py new file mode 100644 index 000000000..d0f26a4eb --- /dev/null +++ b/scEU-seq_CellCycle_ICSP.py @@ -0,0 +1,213 @@ +#!/usr/bin/env python +# -*- coding:utf-8 -*- + +import warnings + +warnings.filterwarnings('ignore') + +import dynamo as dyn + +filename = './data/rpe1.h5ad' + +rpe1 = dyn.read(filename) + +dyn.convert2float(rpe1, ['Cell_cycle_possition', 'Cell_cycle_relativePos']) + +rpe1.obs.exp_type.value_counts() + +rpe1[rpe1.obs.exp_type == 'Chase', :].obs.time.value_counts() + +rpe1[rpe1.obs.exp_type == 'Pulse', :].obs.time.value_counts() + +rpe1_kinetics = rpe1[rpe1.obs.exp_type == 'Pulse', :] +rpe1_kinetics.obs['time'] = rpe1_kinetics.obs['time'].astype(str) +rpe1_kinetics.obs.loc[rpe1_kinetics.obs['time'] == 'dmso', 'time'] = -1 +rpe1_kinetics.obs['time'] = rpe1_kinetics.obs['time'].astype(float) +rpe1_kinetics = rpe1_kinetics[rpe1_kinetics.obs.time != -1, :] + +rpe1_genes = ['UNG', 'PCNA', 'PLK1', 'HPRT1'] + +rpe1_kinetics.obs.time = rpe1_kinetics.obs.time.astype('float') +rpe1_kinetics.obs.time = rpe1_kinetics.obs.time / 60 # convert minutes to hours + +print(rpe1_kinetics.obs.time.value_counts()) + +# from dynamo.tools.recipes 
import recipe_kin_data +# # velocity +# recipe_kin_data(adata=rpe1_kinetics, +# keep_filtered_genes=True, +# keep_raw_layers=True, +# del_2nd_moments=True, +# tkey='time', +# n_top_genes=1000, +# # est_method='twostep', +# ) + +from dynamo.tools.dynamics import dynamics_wrapper +from dynamo.tools.dimension_reduction import reduceDimension +from dynamo.tools.cell_velocities import cell_velocities +from dynamo.preprocessing.utils import ( + del_raw_layers, + detect_experiment_datatype, + reset_adata_X, + collapse_species_adata +) +from dynamo.preprocessing import Preprocessor +from dynamo.tools.moments import moments +from dynamo.preprocessing.pca import pca +from dynamo.tools.connectivity import neighbors,normalize_knn_graph +import numpy as np + + +keep_filtered_cells = False +keep_filtered_genes = False +keep_raw_layers = True +del_2nd_moments = True +has_splicing, has_labeling, splicing_labeling = True, True, True +if has_splicing and has_labeling and splicing_labeling: + layers = ["X_new", "X_total", "X_uu", "X_ul", "X_su", "X_sl"] +elif has_labeling: + layers = ["X_new", "X_total"] + +# Preprocessing +preprocessor = Preprocessor(cell_cycle_score_enable=True) +preprocessor.config_monocle_recipe(rpe1_kinetics, n_top_genes=1000) +preprocessor.size_factor_kwargs.update( + { + "X_total_layers": False, + "splicing_total_layers": False, + } +) +preprocessor.normalize_by_cells_function_kwargs.update( + { + "X_total_layers": False, + "splicing_total_layers": False, + "keep_filtered": keep_filtered_genes, + "total_szfactor": "total_Size_Factor", + } +) +preprocessor.filter_cells_by_outliers_kwargs["keep_filtered"] = keep_filtered_cells +preprocessor.select_genes_kwargs["keep_filtered"] = keep_filtered_genes + +rpe1_kinetics = collapse_species_adata(rpe1_kinetics) +if True: + reset_adata_X(rpe1_kinetics, experiment_type="kin", has_labeling=has_labeling, has_splicing=has_splicing) +preprocessor.preprocess_adata_monocle(adata=rpe1_kinetics, tkey='time', experiment_type="kin") +if not keep_raw_layers: + del_raw_layers(rpe1_kinetics) + +tkey = rpe1_kinetics.uns["pp"]["tkey"] +# first calculate moments for labeling data relevant layers using total based connectivity graph +moments(rpe1_kinetics, group=tkey, layers=layers) + +# then we want to calculate moments for spliced and unspliced layers based on connectivity graph from spliced +# data. 
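# Aside (explanatory, not a line of this diff): the moments() calls in this script deliberately mix
# two kNN graphs — the labeling layers (X_new/X_total) above are smoothed with the default
# total-RNA based connectivity graph, while the spliced-related layers below get a graph rebuilt
# from a spliced-only PCA embedding, so splicing quantities are averaged over neighbors defined in
# their own expression space before the storm-icsp estimation.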
+# first get X_spliced based pca embedding +CM = np.log1p(rpe1_kinetics[:, rpe1_kinetics.var.use_for_pca].layers["X_spliced"].A) +cm_genesums = CM.sum(axis=0) +valid_ind = np.logical_and(np.isfinite(cm_genesums), cm_genesums != 0) +valid_ind = np.array(valid_ind).flatten() + +pca(rpe1_kinetics, CM[:, valid_ind], pca_key="X_spliced_pca") +# then get neighbors graph based on X_spliced_pca +neighbors(rpe1_kinetics, X_data=rpe1_kinetics.obsm["X_spliced_pca"], layer="X_spliced") +# then normalize neighbors graph so that each row sums up to be 1 +conn = normalize_knn_graph(rpe1_kinetics.obsp["connectivities"] > 0) +# then calculate moments for spliced related layers using spliced based connectivity graph +moments(rpe1_kinetics, conn=conn, layers=["X_spliced", "X_unspliced"]) +# then perform kinetic estimations with properly preprocessed layers for either the labeling or the splicing +# data +moments(rpe1_kinetics, conn=conn, layers=["uu", "ul", "su", "sl", "new", "total"]) + +dynamics_wrapper(rpe1_kinetics, model="stochastic", est_method="storm-icsp", del_2nd_moments=del_2nd_moments) +reduceDimension(rpe1_kinetics, reduction_method='umap') +cell_velocities(rpe1_kinetics, basis='umap') + +rpe1_kinetics.obsm['X_RFP_GFP'] = rpe1_kinetics.obs.loc[:, + ['RFP_log10_corrected', 'GFP_log10_corrected']].values.astype('float') + +# total velocity +dyn.tl.reduceDimension(rpe1_kinetics, reduction_method='umap') +dyn.tl.cell_velocities(rpe1_kinetics, enforce=True, vkey='velocity_T', ekey='M_t', basis='RFP_GFP') +dyn.pl.streamline_plot(rpe1_kinetics, color=['cell_cycle_phase'], basis='RFP_GFP') + +# spliced RNA velocity +dyn.tl.reduceDimension(rpe1_kinetics, reduction_method='umap') +dyn.tl.cell_velocities(rpe1_kinetics, enforce=True, vkey='velocity_S', ekey='M_s', basis='RFP_GFP') +dyn.pl.streamline_plot(rpe1_kinetics, color=['cell_cycle_phase'], basis='RFP_GFP') + +# # for velocity gene-wise parameters +# import matplotlib.pyplot as plt +# import scanpy as sc +# sc.set_figure_params(scanpy=True, fontsize=6) +# plt.rcParams['font.size'] = '6' +# dyn.configuration.set_figure_params(dpi_save=600, figsize=(17 / 3 / 2.54, 17 / 3 / 2.54 * (4 / 6))) +# +# save_path = './cell_wise_figures/' +# dyn.pl.streamline_plot(rpe1_kinetics, color=['cell_cycle_phase'], vkey='velocity_T', ekey='M_t', basis='RFP_GFP', +# save_show_or_return='show', +# save_kwargs={"path": save_path, "prefix": 'icsp_vt_stream_gene-wise_alpha_beta', "dpi": 600, 'ext':'png'}) +# +# +# # spliced RNA velocity +# dyn.tl.reduceDimension(rpe1_kinetics, reduction_method='umap') +# dyn.tl.cell_velocities(rpe1_kinetics, enforce=True, vkey='velocity_S', ekey='M_s', basis='RFP_GFP') +# dyn.pl.streamline_plot(rpe1_kinetics, color=['cell_cycle_phase'], basis='RFP_GFP') +# +# # for velocity gene-wise parameters +# save_path = './cell_wise_figures/' +# dyn.pl.streamline_plot(rpe1_kinetics, color=['cell_cycle_phase'], vkey='velocity_S', ekey='M_s', basis='RFP_GFP', +# save_show_or_return='show', +# save_kwargs={"path": save_path, "prefix": 'icsp_vs_stream_gene-wise_alpha_beta', "dpi": 600, 'ext':'png'}) +# +# +# import scvelo as scv +# import matplotlib.pyplot as plt +# +# plt.rcParams['font.size'] = '7' +# dpi = 600 +# figsize = (6, 3) +# +# well_fitted = rpe1_kinetics.var['gamma_r2'] > 0 +# well_fitted_genes = well_fitted[well_fitted].index +# # well_fitted_genes = rpe1_kinetics.var['gamma_r2'].sort_values(ascending=False).index[:400] +# save_path = './cell_wise_figures/icsp_beta.png' +# ax = scv.pl.heatmap(rpe1_kinetics, +# var_names=well_fitted_genes, +# 
sortby='Cell_cycle_relativePos', +# col_color='cell_cycle_phase', +# n_convolve=100, +# layer='cell_wise_beta', +# figsize=(6, 3), +# show=False) +# # plt.savefig(save_path, dpi=dpi, figsize=figsize) +# plt.show() +# +# +# # dyn.configuration.set_figure_params(fontsize=6, dpi=300) +# # genes = ['HMGA2'] +# # dyn.pl.phase_portraits(rpe1_kinetics, genes=genes, color='cell_cycle_phase', basis='RFP_GFP', vkey='velocity_T', +# # ekey='M_t', show_arrowed_spines=False, show_quiver=True, quiver_size=5, +# # figsize=(6 * 0.53, 4 * 0.53)) +# # genes = ['DCBLD2'] +# # dyn.pl.phase_portraits(rpe1_kinetics, genes=genes, color='cell_cycle_phase', basis='RFP_GFP', vkey='velocity_T', +# # ekey='M_t', show_arrowed_spines=False, show_quiver=True, quiver_size=5, +# # figsize=(6 * 0.53, 4 * 0.53)) +# # genes = ['HIPK2'] +# # dyn.pl.phase_portraits(rpe1_kinetics, genes=genes, color='cell_cycle_phase', basis='RFP_GFP', vkey='velocity_T', +# # ekey='M_t', show_arrowed_spines=False, show_quiver=True, quiver_size=5, +# # figsize=(6 * 0.53, 4 * 0.53)) +# # +# # # dyn.configuration.set_figure_params(fontsize=6, dpi=300) +# # # genes = ['HMGA2'] +# # # dyn.pl.phase_portraits(rpe1_kinetics, genes=genes, color='cell_cycle_phase', basis='RFP_GFP', vkey='velocity_S', +# # # ekey='M_s', show_arrowed_spines=False, show_quiver=True, quiver_size=5, +# # # figsize=(6 * 0.53, 4 * 0.53)) +# # # genes = ['DCBLD2'] +# # # dyn.pl.phase_portraits(rpe1_kinetics, genes=genes, color='cell_cycle_phase', basis='RFP_GFP', vkey='velocity_S', +# # # ekey='M_s', show_arrowed_spines=False, show_quiver=True, quiver_size=5, +# # # figsize=(6 * 0.53, 4 * 0.53)) +# # # genes = ['HIPK2'] +# # # dyn.pl.phase_portraits(rpe1_kinetics, genes=genes, color='cell_cycle_phase', basis='RFP_GFP', vkey='velocity_S', +# # # ekey='M_s', show_arrowed_spines=False, show_quiver=True, quiver_size=5, +# # # figsize=(6 * 0.53, 4 * 0.53)) From 395c9cd053502adda8935e2a2b9c243234ecca6e Mon Sep 17 00:00:00 2001 From: sichao Date: Wed, 21 Jun 2023 14:43:39 -0400 Subject: [PATCH 20/31] rename functions --- dynamo/tools/dynamics.py | 124 +++++++++++++++++++-------------------- 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/dynamo/tools/dynamics.py b/dynamo/tools/dynamics.py index fbf96d3c6..9beec01e4 100755 --- a/dynamo/tools/dynamics.py +++ b/dynamo/tools/dynamics.py @@ -302,7 +302,7 @@ def __init__(self, dynamics_kwargs: Dict): self.tkey = self.adata.uns["pp"]["tkey"] if dynamics_kwargs["tkey"] is None else dynamics_kwargs["tkey"] self.est_kwargs = dynamics_kwargs["est_kwargs"] - def _estimate_params_ss(self, subset_adata: AnnData, **est_params_args): + def estimate_params_ss(self, subset_adata: AnnData, **est_params_args): """Estimate velocity parameters with steady state mRNA assumption.""" if self.est_method.lower() == "auto": self.est_method = "gmm" if self.model.lower() == "stochastic" else "ols" @@ -347,7 +347,7 @@ def _estimate_params_ss(self, subset_adata: AnnData, **est_params_args): self.alpha, self.beta, self.gamma, self.eta, self.delta = self.est.parameters.values() - def _estimate_params_kin(self, cur_grp_i: int, cur_grp: str, subset_adata: AnnData, **est_params_args): + def estimate_params_kin(self, cur_grp_i: int, cur_grp: str, subset_adata: AnnData, **est_params_args): """Estimate velocity parameters with kinetic mRNA assumption.""" return_ntr = True if self.fraction_for_deg and self.experiment_type.lower() == "deg" else False @@ -419,9 +419,9 @@ def estimate_parameters(self, cur_grp_i: int, cur_grp: str, subset_adata: AnnDat """Wrapper 
to call corresponding parameters estimation functions according to assumptions. Override this in the subclass if the class doesn't use ss_estimation or kinetic_model to estimate.""" if self.assumption_mRNA.lower() == "ss" or (self.experiment_type.lower() in ["one-shot", "mix_std_stm"]): - self._estimate_params_ss(subset_adata=subset_adata, **est_params_args) + self.estimate_params_ss(subset_adata=subset_adata, **est_params_args) elif self.assumption_mRNA.lower() == "kinetic": - self._estimate_params_kin(cur_grp_i=cur_grp_i, cur_grp=cur_grp, subset_adata=subset_adata, **est_params_args) + self.estimate_params_kin(cur_grp_i=cur_grp_i, cur_grp=cur_grp, subset_adata=subset_adata, **est_params_args) else: main_warning("Not implemented yet.") @@ -506,7 +506,7 @@ def set_velocity( else: main_warning("Not implemented yet.") - def _calculate_velocity( + def calculate_vels( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -528,7 +528,7 @@ def _calculate_velocity( """ raise NotImplementedError("This method has not been implemented.") - def _calculate_vel_P( + def calculate_vel_P( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -565,8 +565,8 @@ def calculate_velocity(self, subset_adata: AnnData) -> Tuple: else: main_warning("Not implemented yet.") - vel_U, vel_S, vel_N, vel_T = self._calculate_velocity(vel=vel, U=U, S=S, N=N, T=T) - vel_P = self._calculate_vel_P(vel=vel, U=U, S=S, N=N, T=T) + vel_U, vel_S, vel_N, vel_T = self.calculate_vels(vel=vel, U=U, S=S, N=N, T=T) + vel_P = self.calculate_vel_P(vel=vel, U=U, S=S, N=N, T=T) return vel_U, vel_S, vel_N, vel_T, vel_P @@ -776,7 +776,7 @@ def estimate(self): class SplicedDynamics(BaseDynamics): """Dynamics models for RNA data only contain spliced RNA. This includes the conventional, generalized moments method (GMM) and negative binomial (NB) distribution method.""" - def _calculate_velocity( + def calculate_vels( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -794,7 +794,7 @@ def _calculate_velocity( class LabeledDynamics(BaseDynamics): """Dynamics model for metabolic labeling data.""" - def _calculate_vel_U( + def calculate_vel_U( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -805,7 +805,7 @@ def _calculate_vel_U( """Calculate unspliced velocity. All subclass should implement this method.""" raise NotImplementedError("This method has not been implemented.") - def _calculate_vel_S( + def calculate_vel_S( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -816,7 +816,7 @@ def _calculate_vel_S( """Calculate spliced velocity. All subclass should implement this method.""" raise NotImplementedError("This method has not been implemented.") - def _calculate_vel_N( + def calculate_vel_N( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -827,7 +827,7 @@ def _calculate_vel_N( """Calculate new velocity. All subclass should implement this method.""" raise NotImplementedError("This method has not been implemented.") - def _calculate_vel_T( + def calculate_vel_T( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -838,7 +838,7 @@ def _calculate_vel_T( """Calculate total velocity. All subclass should implement this method.""" raise NotImplementedError("This method has not been implemented.") - def _calculate_velocity( + def calculate_vels( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -849,18 +849,18 @@ def _calculate_velocity( """Implement the velocity calculation function for metabolic labeling data. 
Unsplcied and spliced velocity will be nan for data without splicing information.""" if self.has_splicing: - vel_U = self._calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) - vel_S = self._calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) + vel_U = self.calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) + vel_S = self.calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) else: vel_U, vel_S = np.nan, np.nan - vel_N = self._calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) - vel_T = self._calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) + vel_N = self.calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) + vel_T = self.calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) return vel_U, vel_S, vel_N, vel_T class OneShotDynamics(LabeledDynamics): """Dynamics model for the one shot experiment, where there is only one labeling time point.""" - def _calculate_vel_U( + def calculate_vel_U( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -870,7 +870,7 @@ def _calculate_vel_U( ) -> Union[ndarray, csr_matrix]: return vel.vel_u(U) - def _calculate_vel_S( + def calculate_vel_S( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -880,7 +880,7 @@ def _calculate_vel_S( ) -> Union[ndarray, csr_matrix]: return vel.vel_s(U, S) - def _calculate_vel_N( + def calculate_vel_N( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -890,7 +890,7 @@ def _calculate_vel_N( ) -> Union[ndarray, csr_matrix]: return vel.vel_u(N) - def _calculate_vel_T( + def calculate_vel_T( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -904,7 +904,7 @@ def _calculate_vel_T( class SSKineticsDynamics(LabeledDynamics): """Two-step dynamics model for the Kinetic experiment with steady state assumption, which relies on two consecutive linear regressions to estimate the degradation rate.""" - def _calculate_vel_U( + def calculate_vel_U( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -914,7 +914,7 @@ def _calculate_vel_U( ) -> Union[ndarray, csr_matrix]: return N.multiply(csr_matrix(self.gamma_ / self.Kc)) - csr_matrix(self.beta).multiply(U) - def _calculate_vel_S( + def calculate_vel_S( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -924,7 +924,7 @@ def _calculate_vel_S( ) -> Union[ndarray, csr_matrix]: return vel.vel_s(U, S) - def _calculate_vel_N( + def calculate_vel_N( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -934,7 +934,7 @@ def _calculate_vel_N( ) -> Union[ndarray, csr_matrix]: return (N - csr_matrix(self.Kc).multiply(N)).multiply(csr_matrix(self.gamma_ / self.Kc)) - def _calculate_vel_T( + def calculate_vel_T( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -944,7 +944,7 @@ def _calculate_vel_T( ) -> Union[ndarray, csr_matrix]: return (N - csr_matrix(self.Kc).multiply(T)).multiply(csr_matrix(self.gamma_ / self.Kc)) - def _calculate_velocity( + def calculate_vels( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -956,19 +956,19 @@ def _calculate_velocity( self.Kc = np.clip(self.gamma[:, None], 0, 1 - 1e-3) # S - U slope self.gamma_ = -(np.log(1 - self.Kc) / self.t[None, :]) # actual gamma if self.has_splicing: - vel_U = self._calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) - vel_S = self._calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) + vel_U = self.calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) + vel_S = self.calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) else: vel_U, vel_S = np.nan, np.nan - vel_N = self._calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) - vel_T = self._calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) + vel_N = self.calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) + vel_T = self.calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) 
return vel_U, vel_S, vel_N, vel_T class KineticsDynamics(LabeledDynamics): """Dynamic models for the kinetic experiment with kinetic assumption. This includes a kinetic two-step method and the direct method.""" - def _calculate_vel_U( + def calculate_vel_U( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -978,7 +978,7 @@ def _calculate_vel_U( ) -> Union[ndarray, csr_matrix]: return vel.vel_u(U) - def _calculate_vel_S( + def calculate_vel_S( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -988,7 +988,7 @@ def _calculate_vel_S( ) -> Union[ndarray, csr_matrix]: return vel.vel_s(U, S) - def _calculate_vel_N( + def calculate_vel_N( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -998,7 +998,7 @@ def _calculate_vel_N( ) -> Union[ndarray, csr_matrix]: return vel.vel_u(N) - def _calculate_vel_T( + def calculate_vel_T( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -1008,7 +1008,7 @@ def _calculate_vel_T( ) -> Union[ndarray, csr_matrix]: return vel.vel_u(T) - def _calculate_velocity( + def calculate_vels( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -1018,15 +1018,15 @@ def _calculate_velocity( ) -> Tuple: """Override the velocity calculation function to reset beta or alpha.""" if self.has_splicing: - vel_U = self._calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) - vel_S = self._calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) + vel_U = self.calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) + vel_S = self.calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) vel.parameters["beta"] = self.gamma else: vel_U, vel_S = np.nan, np.nan alpha_ = one_shot_alpha_matrix(N, self.gamma, self.t) vel.parameters["alpha"] = alpha_ - vel_N = self._calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) - vel_T = self._calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) + vel_N = self.calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) + vel_T = self.calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) return vel_U, vel_S, vel_N, vel_T @@ -1034,7 +1034,7 @@ class DegradationDynamics(LabeledDynamics): """Dynamics model for the degradation experiment. 
In degradation experiment, samples are chased after an extended 4sU (or other nucleotide analog) labeling period and the wash-out to observe the decay of the abundance of the (labeled) unspliced and spliced RNA decay over time.""" - def _calculate_vel_U( + def calculate_vel_U( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -1044,7 +1044,7 @@ def _calculate_vel_U( ) -> Union[ndarray, csr_matrix]: return np.nan - def _calculate_vel_S( + def calculate_vel_S( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -1054,7 +1054,7 @@ def _calculate_vel_S( ) -> Union[ndarray, csr_matrix]: return vel.vel_s(U, S) - def _calculate_vel_N( + def calculate_vel_N( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -1064,7 +1064,7 @@ def _calculate_vel_N( ) -> Union[ndarray, csr_matrix]: return np.nan - def _calculate_vel_T( + def calculate_vel_T( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -1077,7 +1077,7 @@ def _calculate_vel_T( class MixStdStmDynamics(LabeledDynamics): """Dynamics model for the mixed steady state and stimulation labeling (mix_std_stm) experiment.""" - def _calculate_vel_U( + def calculate_vel_U( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -1087,7 +1087,7 @@ def _calculate_vel_U( ) -> Union[ndarray, csr_matrix]: return self.alpha1 - csr_matrix(self.beta[:, None]).multiply(U) - def _calculate_vel_S( + def calculate_vel_S( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -1097,7 +1097,7 @@ def _calculate_vel_S( ) -> Union[ndarray, csr_matrix]: return vel.vel_s(U, S) - def _calculate_vel_N( + def calculate_vel_N( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -1107,7 +1107,7 @@ def _calculate_vel_N( ) -> Union[ndarray, csr_matrix]: return self.alpha1 - csr_matrix(self.gamma[:, None]).multiply(self.u_new) - def _calculate_vel_T( + def calculate_vel_T( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -1117,7 +1117,7 @@ def _calculate_vel_T( ) -> Union[ndarray, csr_matrix]: return self.alpha1 - csr_matrix(self.gamma[:, None]).multiply(T) - def _calculate_velocity( + def calculate_vels( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -1134,8 +1134,8 @@ def _calculate_velocity( beta=self.beta, u1=N, ) - vel_U = self._calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) - vel_S = self._calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) + vel_U = self.calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) + vel_S = self.calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) else: u0, self.u_new, self.alpha1 = solve_alpha_2p_mat( t0=np.max(self.t) - self.t, @@ -1145,14 +1145,14 @@ def _calculate_velocity( u1=N, ) vel_U, vel_S = np.nan, np.nan - vel_N = self._calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) - vel_T = self._calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) + vel_N = self.calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) + vel_T = self.calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) return vel_U, vel_S, vel_N, vel_T class MixKineticsDynamics(LabeledDynamics): """Dynamics model for two mix experiment type: mix_kin_deg and mix_pulse_chase.""" - def _calculate_vel_U( + def calculate_vel_U( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -1162,7 +1162,7 @@ def _calculate_vel_U( ) -> Union[ndarray, csr_matrix]: return vel.vel_u(U, repeat=True) - def _calculate_vel_S( + def calculate_vel_S( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -1172,7 +1172,7 @@ def _calculate_vel_S( ) -> Union[ndarray, csr_matrix]: return vel.vel_s(U, S) - def _calculate_vel_N( + def calculate_vel_N( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -1182,7 +1182,7 @@ def 
_calculate_vel_N( ) -> Union[ndarray, csr_matrix]: return vel.vel_u(N, repeat=True) - def _calculate_vel_T( + def calculate_vel_T( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -1192,7 +1192,7 @@ def _calculate_vel_T( ) -> Union[ndarray, csr_matrix]: return vel.vel_u(T) if not self.has_splicing and self.NTR_vel else vel.vel_u(T, repeat=True) - def _calculate_velocity( + def calculate_vels( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -1202,13 +1202,13 @@ def _calculate_velocity( ) -> Tuple: """Override the velocity calculation function to reset beta when the data contains splicing information.""" if self.has_splicing: - vel_U = self._calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) - vel_S = self._calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) + vel_U = self.calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) + vel_S = self.calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) vel.parameters["beta"] = self.gamma else: vel_U, vel_S = np.nan, np.nan - vel_N = self._calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) - vel_T = self._calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) + vel_N = self.calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) + vel_T = self.calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) return vel_U, vel_S, vel_N, vel_T From 3bfb7e8ffd0bd81f1f4fb3f9ad181b310fb2a8ef Mon Sep 17 00:00:00 2001 From: Qiangwei Peng <120767104+QiangweiPeng@users.noreply.github.com> Date: Thu, 6 Jul 2023 17:51:07 +0800 Subject: [PATCH 21/31] Delete scEU-seq_CellCycle_CSP_CSZIP.py delete test code --- scEU-seq_CellCycle_CSP_CSZIP.py | 240 -------------------------------- 1 file changed, 240 deletions(-) delete mode 100644 scEU-seq_CellCycle_CSP_CSZIP.py diff --git a/scEU-seq_CellCycle_CSP_CSZIP.py b/scEU-seq_CellCycle_CSP_CSZIP.py deleted file mode 100644 index f1a9ac691..000000000 --- a/scEU-seq_CellCycle_CSP_CSZIP.py +++ /dev/null @@ -1,240 +0,0 @@ -#!/usr/bin/env python -# -*- coding:utf-8 -*- - -import warnings - -warnings.filterwarnings('ignore') -import dynamo as dyn - -filename = './data/rpe1.h5ad' - -rpe1 = dyn.read(filename) - -dyn.convert2float(rpe1, ['Cell_cycle_possition', 'Cell_cycle_relativePos']) - -rpe1.obs.exp_type.value_counts() - -rpe1[rpe1.obs.exp_type == 'Chase', :].obs.time.value_counts() - -rpe1[rpe1.obs.exp_type == 'Pulse', :].obs.time.value_counts() - -rpe1_kinetics = rpe1[rpe1.obs.exp_type == 'Pulse', :] -rpe1_kinetics.obs['time'] = rpe1_kinetics.obs['time'].astype(str) -rpe1_kinetics.obs.loc[rpe1_kinetics.obs['time'] == 'dmso', 'time'] = -1 -rpe1_kinetics.obs['time'] = rpe1_kinetics.obs['time'].astype(float) -rpe1_kinetics = rpe1_kinetics[rpe1_kinetics.obs.time != -1, :] - -rpe1_kinetics.layers['new'], rpe1_kinetics.layers['total'] = rpe1_kinetics.layers['ul'] + rpe1_kinetics.layers['sl'], \ - rpe1_kinetics.layers['su'] + rpe1_kinetics.layers['sl'] + \ - rpe1_kinetics.layers['uu'] + rpe1_kinetics.layers['ul'] - -del rpe1_kinetics.layers['uu'], rpe1_kinetics.layers['ul'], rpe1_kinetics.layers['su'], rpe1_kinetics.layers['sl'] - -print(rpe1_kinetics.obs.time) - -rpe1_kinetics.obs.time = rpe1_kinetics.obs.time.astype('float') -rpe1_kinetics.obs.time = rpe1_kinetics.obs.time / 60 # convert minutes to hours - -# # velocity -# dyn.tl.recipe_kin_data(adata=rpe1_kinetics, -# keep_filtered_genes=True, -# keep_raw_layers=True, -# del_2nd_moments=False, -# tkey='time', -# n_top_genes=1000, -# ) - -from dynamo.tools.dynamics import dynamics_wrapper -from dynamo.tools.dimension_reduction import reduceDimension -from dynamo.tools.cell_velocities import cell_velocities -from dynamo.preprocessing.utils import 
( - del_raw_layers, - detect_experiment_datatype, - reset_adata_X, -) -from dynamo.preprocessing import Preprocessor - -keep_filtered_cells = False -keep_filtered_genes = True -keep_raw_layers = True -del_2nd_moments = True -has_splicing, has_labeling, splicing_labeling = False, True, False -if has_splicing and has_labeling and splicing_labeling: - layers = ["X_new", "X_total", "X_uu", "X_ul", "X_su", "X_sl"] -elif has_labeling: - layers = ["X_new", "X_total"] - -# Preprocessing -preprocessor = Preprocessor(cell_cycle_score_enable=True) -preprocessor.config_monocle_recipe(rpe1_kinetics, n_top_genes=1000) -preprocessor.size_factor_kwargs.update( - { - "X_total_layers": False, - "splicing_total_layers": False, - } -) -preprocessor.normalize_by_cells_function_kwargs.update( - { - "X_total_layers": False, - "splicing_total_layers": False, - "keep_filtered": keep_filtered_genes, - "total_szfactor": "total_Size_Factor", - } -) -preprocessor.filter_cells_by_outliers_kwargs["keep_filtered"] = keep_filtered_cells -preprocessor.select_genes_kwargs["keep_filtered"] = keep_filtered_genes -if True: - reset_adata_X(rpe1_kinetics, experiment_type="kin", has_labeling=has_labeling, has_splicing=has_splicing) -preprocessor.preprocess_adata_monocle(adata=rpe1_kinetics, tkey='time', experiment_type="kin") -if not keep_raw_layers: - del_raw_layers(rpe1_kinetics) - -dynamics_wrapper(rpe1_kinetics, model="stochastic", est_method="storm-cszip", del_2nd_moments=del_2nd_moments) -reduceDimension(rpe1_kinetics, reduction_method='umap') -cell_velocities(rpe1_kinetics, basis='umap') - - -# dyn.tl.gene_wise_confidence(adata=rpe1_kinetics, -# group='cell_cycle_phase', -# lineage_dict={'M': 'G2-M'}, -# ekey='M_t', -# vkey='velocity_T' -# ) - -rpe1_kinetics.obsm['X_RFP_GFP'] = rpe1_kinetics.obs.loc[:, - ['RFP_log10_corrected', 'GFP_log10_corrected']].values.astype('float') - -dyn.tl.reduceDimension(rpe1_kinetics, reduction_method='umap') -dyn.tl.cell_velocities(rpe1_kinetics, enforce=True, vkey='velocity_T', ekey='M_t', basis='RFP_GFP') -dyn.pl.streamline_plot(rpe1_kinetics, color=['cell_cycle_phase'], basis='RFP_GFP') - -# # # for velocity gene-wise parameters -# # import matplotlib.pyplot as plt -# # import scanpy as sc -# # sc.set_figure_params(scanpy=True, fontsize=6) -# # plt.rcParams['font.size'] = '6' -# # dyn.configuration.set_figure_params(dpi_save=600, figsize=(17 / 3 / 2.54, 17 / 3 / 2.54 * (4 / 6))) -# # -# # save_path = './cell_wise_figures/' -# # dyn.pl.streamline_plot(rpe1_kinetics, color=['cell_cycle_phase'], vkey='velocity_T', ekey='M_t', basis='RFP_GFP', -# # save_show_or_return='show', -# # save_kwargs={"path": save_path, "prefix": 'cszip_stream_gene-wise_alpha', "dpi": 600, 'ext':'png'}) -# -# -# import scvelo as scv -# import matplotlib.pyplot as plt -# -# plt.rcParams['font.size'] = '7' -# dpi = 600 -# figsize = (6, 3) -# -# well_fitted = rpe1_kinetics.var['gamma_r2'] > 0 -# print(well_fitted) -# well_fitted_genes = well_fitted[well_fitted].index -# # well_fitted_genes = rpe1_kinetics.var['gamma_r2_raw'].sort_values(ascending=False).index[:1000] -# save_path = './cell_wise_figures/csp_alpha.png' -# # save_path = './cell_wise_figures/cszip_alpha_m_p_on.png' -# -# from dynamo.preprocessing.cell_cycle import get_cell_phase_genes -# -# cell_cycle_genes = get_cell_phase_genes(rpe1_kinetics, None) -# print(cell_cycle_genes) -# -# # yticklabels = [None]*len(well_fitted_genes) -# -# -# ax = scv.pl.heatmap(rpe1_kinetics, -# var_names=well_fitted_genes, -# sortby='Cell_cycle_relativePos', -# 
col_color='cell_cycle_phase', -# n_convolve=100, -# layer='cell_wise_alpha', -# figsize=(6, 3), -# show=False, -# colorbar=True, -# cbar_pos=(0.12, 0.4, 0.05, 0.18) -# # yticklabels=yticklabels -# ) -# # plt.colorbar() -# # plt.savefig(save_path, dpi=dpi, figsize=figsize) -# plt.show() -# -# # # genes = ['HMGA2', 'DCBLD2', 'HIPK2'] -# # dyn.configuration.set_figure_params(fontsize=6) -# # genes = ['HMGA2'] -# # dyn.pl.phase_portraits(rpe1_kinetics, genes=genes, color='cell_cycle_phase', basis='RFP_GFP', vkey='velocity_T', -# # ekey='M_t', show_arrowed_spines=False, show_quiver=True, quiver_size=5, figsize=(6*0.53, 4*0.53)) -# # genes = ['DCBLD2'] -# # dyn.pl.phase_portraits(rpe1_kinetics, genes=genes, color='cell_cycle_phase', basis='RFP_GFP', vkey='velocity_T', -# # ekey='M_t', show_arrowed_spines=False, show_quiver=True, quiver_size=5, figsize=(6*0.53, 4*0.53)) -# # genes = ['HIPK2'] -# # dyn.pl.phase_portraits(rpe1_kinetics, genes=genes, color='cell_cycle_phase', basis='RFP_GFP', vkey='velocity_T', -# # ekey='M_t', show_arrowed_spines=False, show_quiver=True, quiver_size=5, figsize=(6*0.53, 4*0.53)) -# -# -# # dyn.vf.VectorField(rpe1_kinetics, basis='RFP_GFP', map_topography=True, M=100) -# # import matplotlib.pyplot as plt -# # import numpy as np -# # -# # fig, ax = plt.subplots() -# # ax = dyn.pl.topography(rpe1_kinetics, basis='RFP_GFP', color='Cell_cycle_relativePos', ax=ax, -# # save_show_or_return='show', fps_basis='RFP_GFP') -# -# # # dyn.tl.cell_velocities(rpe1_kinetics, basis='pca') -# # # dyn.vf.VectorField(rpe1_kinetics, basis='pca', M=100) -# # # dyn.pp.top_pca_genes(rpe1_kinetics, n_top_genes=100) -# # # top_pca_genes = rpe1_kinetics.var.index[rpe1_kinetics.var.top_pca_genes] -# # # top_pca_genes = ["CDK4", "CDK6", "CDK2", "SKP2", "WEE1", "CDK1", "CDC20"] + list(top_pca_genes) -# # -# # dyn.tl.cell_velocities(rpe1_kinetics, basis='pca') -# # dyn.vf.VectorField(rpe1_kinetics, basis='pca', M=100) -# # # top_pca_genes = rpe1_kinetics[:, rpe1_kinetics.var['use_for_transition']].var.index.tolist() -# # # top_pca_genes = ["CDK4", "CDK6", "CDK2", "SKP2", "WEE1", "CDK1", "CDC20"] + list(top_pca_genes) -# # top_pca_genes = ["CDK4", "CDK6", "CDK2", "SKP2", "WEE1", "CDK1", "CDC20"] -# # -# # dyn.vf.jacobian(rpe1_kinetics, regulators=top_pca_genes, effectors=top_pca_genes) -# # dyn.pl.jacobian( -# # rpe1_kinetics, -# # regulators=top_pca_genes, -# # effectors=top_pca_genes, -# # basis="RFP_GFP", -# # ) -# # -# # divergence_rank = dyn.vf.rank_divergence_genes(rpe1_kinetics, groups='cell_cycle_phase') -# # dyn.vf.rank_jacobian_genes(rpe1_kinetics, groups='cell_cycle_phase') -# # -# # full_reg_rank = dyn.vf.rank_jacobian_genes(rpe1_kinetics, -# # groups='cell_cycle_phase', -# # mode="full_reg", -# # abs=True, -# # output_values=True, -# # return_df=True) -# # full_eff_rank = dyn.vf.rank_jacobian_genes(rpe1_kinetics, -# # groups='cell_cycle_phase', -# # mode='full_eff', -# # abs=True, -# # exclude_diagonal=True, -# # output_values=True, -# # return_df=True) -# # # print(full_reg_rank['G2-M']) -# # # print(full_reg_rank['G2-M'].head(2)) -# # -# # # unknown_cell_type_regulators = ["E2F", "Cdk4", "Cdk6", "pRB", "pRBp", "pRBpp", "Cdk2", -# # # "Skp2", "Wee1", "Cdh1", "Cdc25", "Cdk1", "Cdc20"] -# # unknown_cell_type_regulators = ["CDK4", "CDK6", "CDK2", "SKP2", "WEE1", "CDK1", "CDC20"] -# # -# # edges_list = dyn.vf.build_network_per_cluster(rpe1_kinetics, -# # cluster='cell_cycle_phase', -# # cluster_names=None, -# # full_reg_rank=full_reg_rank, -# # full_eff_rank=full_eff_rank, -# # 
genes=np.unique(unknown_cell_type_regulators), -# # n_top_genes=100) -# # -# # import networkx as nx -# # -# # print(edges_list) -# # network = nx.from_pandas_edgelist(edges_list['G1-S'], 'regulator', 'target', edge_attr='weight', -# # create_using=nx.DiGraph()) -# # ax = dyn.pl.arcPlot(rpe1_kinetics, cluster='cell_cycle_phase', cluster_name="G1-S", edges_list=None, -# # network=network, color="M_t", save_show_or_return='show') From 25a283143d6978639dd8576f3047f841262f0f62 Mon Sep 17 00:00:00 2001 From: Qiangwei Peng <120767104+QiangweiPeng@users.noreply.github.com> Date: Thu, 6 Jul 2023 17:51:39 +0800 Subject: [PATCH 22/31] Delete cell_cycle_CSPss.py delete test code --- cell_cycle_CSPss.py | 124 -------------------------------------------- 1 file changed, 124 deletions(-) delete mode 100644 cell_cycle_CSPss.py diff --git a/cell_cycle_CSPss.py b/cell_cycle_CSPss.py deleted file mode 100644 index 8fa931025..000000000 --- a/cell_cycle_CSPss.py +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/env python -# -*- coding:utf-8 -*- -import warnings - -warnings.filterwarnings('ignore') -import dynamo as dyn - -filename = './data/rpe1.h5ad' - -rpe1 = dyn.read(filename) - -dyn.convert2float(rpe1, ['Cell_cycle_possition', 'Cell_cycle_relativePos']) - -rpe1.obs.exp_type.value_counts() - -rpe1[rpe1.obs.exp_type == 'Chase', :].obs.time.value_counts() - -rpe1[rpe1.obs.exp_type == 'Pulse', :].obs.time.value_counts() - -rpe1_kinetics = rpe1[rpe1.obs.exp_type == 'Pulse', :] -rpe1_kinetics.obs['time'] = rpe1_kinetics.obs['time'].astype(str) -rpe1_kinetics.obs.loc[rpe1_kinetics.obs['time'] == 'dmso', 'time'] = -1 -rpe1_kinetics.obs['time'] = rpe1_kinetics.obs['time'].astype(float) -rpe1_kinetics = rpe1_kinetics[rpe1_kinetics.obs.time != -1, :] -# rpe1_kinetics = rpe1_kinetics[rpe1_kinetics.obs.time > 29, :] -# rpe1_kinetics = rpe1_kinetics[rpe1_kinetics.obs.time < 31, :] -rpe1_kinetics = rpe1_kinetics[rpe1_kinetics.obs.time < 16, :] - -rpe1_kinetics.layers['new'], rpe1_kinetics.layers['total'] = rpe1_kinetics.layers['ul'] + rpe1_kinetics.layers['sl'], \ - rpe1_kinetics.layers['su'] + rpe1_kinetics.layers['sl'] + \ - rpe1_kinetics.layers['uu'] + rpe1_kinetics.layers['ul'] - -del rpe1_kinetics.layers['uu'], rpe1_kinetics.layers['ul'], rpe1_kinetics.layers['su'], rpe1_kinetics.layers['sl'] - -print(rpe1_kinetics.obs.time) - -rpe1_kinetics.obs.time = rpe1_kinetics.obs.time.astype('float') -rpe1_kinetics.obs.time = rpe1_kinetics.obs.time / 60 # convert minutes to hours - -# dyn.pp.recipe_monocle( -# rpe1_kinetics, -# tkey="time", -# experiment_type="one-shot", -# # experiment_type="kin", -# n_top_genes=1000, -# total_layers=False, -# keep_raw_layers=True, -# # feature_selection_layer="new", -# ) -# dyn.tl.dynamics(rpe1_kinetics, -# model="deterministic", -# # est_method='CSP4ML_CSPss' -# ) - -from dynamo.tools.dynamics import dynamics_wrapper -from dynamo.tools.dimension_reduction import reduceDimension -from dynamo.tools.cell_velocities import cell_velocities -from dynamo.preprocessing.utils import ( - del_raw_layers, - detect_experiment_datatype, - reset_adata_X, -) -from dynamo.preprocessing import Preprocessor - -keep_filtered_cells = False -keep_filtered_genes = True -keep_raw_layers = True -del_2nd_moments = True -has_splicing, has_labeling, splicing_labeling = False, True, False -if has_splicing and has_labeling and splicing_labeling: - layers = ["X_new", "X_total", "X_uu", "X_ul", "X_su", "X_sl"] -elif has_labeling: - layers = ["X_new", "X_total"] - -# Preprocessing -preprocessor = 
Preprocessor(cell_cycle_score_enable=True) -preprocessor.config_monocle_recipe(rpe1_kinetics, n_top_genes=1000) -preprocessor.size_factor_kwargs.update( - { - "X_total_layers": False, - "splicing_total_layers": False, - } -) -preprocessor.normalize_by_cells_function_kwargs.update( - { - "X_total_layers": False, - "splicing_total_layers": False, - "keep_filtered": keep_filtered_genes, - "total_szfactor": "total_Size_Factor", - } -) -preprocessor.filter_cells_by_outliers_kwargs["keep_filtered"] = keep_filtered_cells -preprocessor.select_genes_kwargs["keep_filtered"] = keep_filtered_genes -if True: - reset_adata_X(rpe1_kinetics, experiment_type="one-shot", has_labeling=has_labeling, has_splicing=has_splicing) -preprocessor.preprocess_adata_monocle(adata=rpe1_kinetics, tkey='time', experiment_type="one-shot") -if not keep_raw_layers: - del_raw_layers(rpe1_kinetics) - -from dynamo.tools.dynamics import dynamics_wrapper -dynamics_wrapper( - rpe1_kinetics, - model="stochastic", - del_2nd_moments=del_2nd_moments, - assumption_mRNA='ss', - one_shot_method='storm-csp', -) -reduceDimension(rpe1_kinetics, reduction_method='umap') -cell_velocities(rpe1_kinetics, enforce=True, vkey='velocity_T', ekey='M_t', basis='umap') - - -rpe1_kinetics.obsm['X_RFP_GFP'] = rpe1_kinetics.obs.loc[:, - ['RFP_log10_corrected', 'GFP_log10_corrected']].values.astype('float') - -dyn.tl.reduceDimension(rpe1_kinetics, reduction_method='umap') -dyn.tl.cell_velocities(rpe1_kinetics, enforce=True, vkey='velocity_T', ekey='M_t', basis='RFP_GFP') -dyn.pl.streamline_plot(rpe1_kinetics, color=['cell_cycle_phase'], basis='RFP_GFP', save_show_or_return='show') - - -# path = './figures_new/figure4/' -# figsize = (6, 4) -# dyn.pl.streamline_plot(rpe1_kinetics, color=['cell_cycle_phase'], basis='RFP_GFP', save_show_or_return='show', -# save_kwargs={'prefix': 'cell_cycle_rfp_gfp_15mins_dynamo', 'ext': 'png', -# "bbox_inches": None, 'dpi': 600, 'path': path}, figsize=figsize) From 561d73788aa73a06b180fd3cbae017f9728b2456 Mon Sep 17 00:00:00 2001 From: sichao Date: Fri, 7 Jul 2023 11:13:59 -0400 Subject: [PATCH 23/31] reformat tab space --- dynamo/estimation/csc/velocity.py | 132 ++++++++++---------- dynamo/tools/dynamics.py | 198 +++++++++++++++--------------- 2 files changed, 165 insertions(+), 165 deletions(-) diff --git a/dynamo/estimation/csc/velocity.py b/dynamo/estimation/csc/velocity.py index 35c79c702..e2d292f06 100755 --- a/dynamo/estimation/csc/velocity.py +++ b/dynamo/estimation/csc/velocity.py @@ -45,14 +45,14 @@ class Velocity: """ def __init__( - self, - alpha=None, - beta=None, - gamma=None, - eta=None, - delta=None, - t=None, - estimation=None, + self, + alpha=None, + beta=None, + gamma=None, + eta=None, + delta=None, + t=None, + estimation=None, ): if estimation is not None: self.parameters = {} @@ -400,28 +400,28 @@ class ss_estimation: """ def __init__( - self, - U=None, - Ul=None, - S=None, - Sl=None, - P=None, - US=None, - S2=None, - NewCounts=None, - TotalCounts=None, - NewSmoothCSP=None, - conn=None, - t=None, - ind_for_proteins=None, - model="stochastic", - est_method="gmm", - experiment_type="deg", - assumption_mRNA=None, - assumption_protein="ss", - concat_data=True, - cores=1, - **kwargs + self, + U=None, + Ul=None, + S=None, + Sl=None, + P=None, + US=None, + S2=None, + NewCounts=None, + TotalCounts=None, + NewSmoothCSP=None, + conn=None, + t=None, + ind_for_proteins=None, + model="stochastic", + est_method="gmm", + experiment_type="deg", + assumption_mRNA=None, + assumption_protein="ss", + concat_data=True, + 
cores=1, + **kwargs ): self.t = t @@ -478,12 +478,12 @@ def __init__( self.ind_for_proteins = ind_for_proteins def fit( - self, - intercept=False, - perc_left=None, - perc_right=5, - clusters=None, - one_shot_method="combined", + self, + intercept=False, + perc_left=None, + perc_right=5, + clusters=None, + one_shot_method="combined", ): """Fit the input data to estimate all or a subset of the parameters @@ -981,7 +981,7 @@ def fit( np.zeros(n_genes), ) for i in range( - n_genes + n_genes ): # can also use the two extreme time points and apply sci-fate like approach. S, U = ( self.data["su"][i] + self.data["sl"][i], @@ -1195,8 +1195,8 @@ def fit( ) if cores == 1: for i in tqdm( - range(n_genes), - desc="estimating beta and alpha for one-shot experiment", + range(n_genes), + desc="estimating beta and alpha for one-shot experiment", ): ( k[i], @@ -1273,8 +1273,8 @@ def fit( ) if cores == 1: for i in tqdm( - range(n_genes), - desc="estimating gamma and alpha for one-shot experiment", + range(n_genes), + desc="estimating gamma and alpha for one-shot experiment", ): ( k[i], @@ -1482,13 +1482,13 @@ def fit( np.zeros(n_genes), ) for i in tqdm( - range(n_genes), desc="solving gamma/beta" + range(n_genes), desc="solving gamma/beta" ): # can also use the two extreme time points and apply sci-fate like approach. tmp = ( - self.data["uu"][i, self.t == t_max] - + self.data["ul"][i, self.t == t_max] - + self.data["su"][i, self.t == t_max] - + self.data["sl"][i, self.t == t_max] + self.data["uu"][i, self.t == t_max] + + self.data["ul"][i, self.t == t_max] + + self.data["su"][i, self.t == t_max] + + self.data["sl"][i, self.t == t_max] ) total[i] = np.mean(tmp) gamma[i] = solve_gamma( @@ -1664,15 +1664,15 @@ def fit_gamma_steady_state(self, u, s, intercept=True, perc_left=None, perc_righ return k, b, r2, all_r2, logLL, all_logLL def fit_gamma_storm_csp( - self, - new_counts, - total_counts, - new_smooth, - total_smooth, - t_uniq, - perc_left=None, - perc_right=50, - normalize=True, + self, + new_counts, + total_counts, + new_smooth, + total_smooth, + t_uniq, + perc_left=None, + perc_right=50, + normalize=True, ): """Estimate gamma using Storm's CSP model based on the steady state assumption. @@ -1706,15 +1706,15 @@ def fit_gamma_storm_csp( return gamma, gamma_r2, k def fit_gamma_stochastic( - self, - est_method, - u, - s, - us, - ss, - perc_left=None, - perc_right=5, - normalize=True, + self, + est_method, + u, + s, + us, + ss, + perc_left=None, + perc_right=5, + normalize=True, ): """Estimate gamma using GMM (generalized method of moments) or negbin distrubtion based on the steady state assumption. 
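# Illustrative call shape for fit_gamma_storm_csp above (a sketch, not patch content; the arguments
# are hypothetical placeholders — the new/total count matrices and their moment-smoothed
# counterparts — and perc_right=50 mirrors the one-shot storm-csp fit wired in earlier in this
# series):
#
#     gamma, gamma_r2, k = self.fit_gamma_storm_csp(
#         new_counts, total_counts, new_smooth, total_smooth, t_uniq, perc_right=50
#     )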
@@ -1896,8 +1896,8 @@ def solve_alpha_mix_std_stm(self, t, ul, beta, clusters=None, alpha_time_depende alpha_std, alpha_stm = alpha_std_ini, np.zeros((ul.shape[0], len(t_uniq))) alpha_stm[:, 0] = alpha_std_ini # 0 stimulation point is the steady state transcription for i in tqdm( - range(ul.shape[0]), - desc="solving steady state alpha and induction alpha", + range(ul.shape[0]), + desc="solving steady state alpha and induction alpha", ): l = ul[i].A.flatten() if issparse(ul) else ul[i] for t_ind in np.arange(1, len(t_uniq)): diff --git a/dynamo/tools/dynamics.py b/dynamo/tools/dynamics.py index 02cce17f5..ed7d84200 100755 --- a/dynamo/tools/dynamics.py +++ b/dynamo/tools/dynamics.py @@ -455,17 +455,17 @@ def estimate_parameters(self, cur_grp_i: int, cur_grp: str, subset_adata: AnnDat main_warning("Not implemented yet.") def set_velocity( - self, - vel_U: Union[ndarray, csr_matrix], - vel_S: Union[ndarray, csr_matrix], - vel_N: Union[ndarray, csr_matrix], - vel_T: Union[ndarray, csr_matrix], - vel_P: Union[ndarray, csr_matrix], - cur_grp: int, - cur_cells_bools: ndarray, - valid_bools_: ndarray, - kin_param_pre: str, - **set_velo_args, + self, + vel_U: Union[ndarray, csr_matrix], + vel_S: Union[ndarray, csr_matrix], + vel_N: Union[ndarray, csr_matrix], + vel_T: Union[ndarray, csr_matrix], + vel_P: Union[ndarray, csr_matrix], + cur_grp: int, + cur_cells_bools: ndarray, + valid_bools_: ndarray, + kin_param_pre: str, + **set_velo_args, ): """Save the calculated parameters and velocity to anndata. Override this in the subclass if the class has a different assumption.""" @@ -644,12 +644,12 @@ def _smooth(self, valid_bools: ndarray): ) def _sanity_check( - self, - valid_bools: ndarray, - valid_bools_: ndarray, - gene_num: int, - subset_adata: AnnData, - kin_param_pre: str, + self, + valid_bools: ndarray, + valid_bools_: ndarray, + gene_num: int, + subset_adata: AnnData, + kin_param_pre: str, ) -> Tuple: """Perform sanity check by checking the slope for kinetic or degradation metabolic labeling experiments.""" indices_valid_bools = np.where(valid_bools)[0] @@ -1066,32 +1066,32 @@ class KineticsStormDynamics(LabeledDynamics): expression states, transcription in the active state, and mRNA degradation.""" def _calculate_vel_U( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: return vel.vel_u(U) def _calculate_vel_S( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: return vel.vel_s(U, S) def _calculate_vel_N( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: if self.est_method == 'storm-icsp': return vel.vel_u(self.Sl) @@ -1099,12 +1099,12 @@ def _calculate_vel_N( return vel.vel_u(N) def _calculate_vel_T( - self, - vel: 
Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Union[ndarray, csr_matrix]: if self.est_method == 'storm-icsp': return vel.vel_u(S) @@ -1112,12 +1112,12 @@ def _calculate_vel_T( return vel.vel_u(T) def _calculate_velocity( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], ) -> Tuple: """Override the velocity calculation function to reset beta or alpha.""" if self.has_splicing: @@ -1315,26 +1315,26 @@ def calculate_vels( # TODO: rename this later def dynamics_wrapper( - adata: AnnData, - filter_gene_mode: Literal["final", "basic", "no"] = "final", - use_smoothed: bool = True, - assumption_mRNA: Literal["ss", "kinetic", "auto"] = "auto", - assumption_protein: Literal["ss"] = "ss", - model: Literal["auto", "deterministic", "stochastic"] = "auto", - est_method: Literal["ols", "rlm", "ransac", "gmm", "negbin", "auto", "twostep", "direct"] = "auto", - NTR_vel: bool = False, - group: Optional[str] = None, - protein_names: Optional[List[str]] = None, - concat_data: bool = False, - log_unnormalized: bool = True, - one_shot_method: Literal["combined", "sci-fate", "sci_fate"] = "combined", - fraction_for_deg: bool = False, - re_smooth: bool = False, - sanity_check: bool = False, - del_2nd_moments: Optional[bool] = None, - cores: int = 1, - tkey: str = None, - **est_kwargs, + adata: AnnData, + filter_gene_mode: Literal["final", "basic", "no"] = "final", + use_smoothed: bool = True, + assumption_mRNA: Literal["ss", "kinetic", "auto"] = "auto", + assumption_protein: Literal["ss"] = "ss", + model: Literal["auto", "deterministic", "stochastic"] = "auto", + est_method: Literal["ols", "rlm", "ransac", "gmm", "negbin", "auto", "twostep", "direct"] = "auto", + NTR_vel: bool = False, + group: Optional[str] = None, + protein_names: Optional[List[str]] = None, + concat_data: bool = False, + log_unnormalized: bool = True, + one_shot_method: Literal["combined", "sci-fate", "sci_fate"] = "combined", + fraction_for_deg: bool = False, + re_smooth: bool = False, + sanity_check: bool = False, + del_2nd_moments: Optional[bool] = None, + cores: int = 1, + tkey: str = None, + **est_kwargs, ) -> AnnData: """Predict the model and assumption if they are set as auto. Run corresponding Dynamics methods according to the experiment type. 
More information can be found in the class BaseDynamics.""" @@ -1449,26 +1449,26 @@ def dynamics_wrapper( # incorporate the model selection code soon def dynamics( - adata: AnnData, - filter_gene_mode: Literal["final", "basic", "no"] = "final", - use_smoothed: bool = True, - assumption_mRNA: Literal["ss", "kinetic", "auto"] = "auto", - assumption_protein: Literal["ss"] = "ss", - model: Literal["auto", "deterministic", "stochastic"] = "auto", - est_method: Literal["ols", "rlm", "ransac", "gmm", "negbin", "auto", "twostep", "direct"] = "auto", - NTR_vel: bool = False, - group: Optional[str] = None, - protein_names: Optional[List[str]] = None, - concat_data: bool = False, - log_unnormalized: bool = True, - one_shot_method: Literal["combined", "sci-fate", "sci_fate"] = "combined", - fraction_for_deg: bool = False, - re_smooth: bool = False, - sanity_check: bool = False, - del_2nd_moments: Optional[bool] = None, - cores: int = 1, - tkey: str = None, - **est_kwargs, + adata: AnnData, + filter_gene_mode: Literal["final", "basic", "no"] = "final", + use_smoothed: bool = True, + assumption_mRNA: Literal["ss", "kinetic", "auto"] = "auto", + assumption_protein: Literal["ss"] = "ss", + model: Literal["auto", "deterministic", "stochastic"] = "auto", + est_method: Literal["ols", "rlm", "ransac", "gmm", "negbin", "auto", "twostep", "direct"] = "auto", + NTR_vel: bool = False, + group: Optional[str] = None, + protein_names: Optional[List[str]] = None, + concat_data: bool = False, + log_unnormalized: bool = True, + one_shot_method: Literal["combined", "sci-fate", "sci_fate"] = "combined", + fraction_for_deg: bool = False, + re_smooth: bool = False, + sanity_check: bool = False, + del_2nd_moments: Optional[bool] = None, + cores: int = 1, + tkey: str = None, + **est_kwargs, ) -> AnnData: """Inclusive model of expression dynamics considers splicing, metabolic labeling and protein translation. @@ -2370,18 +2370,18 @@ def dynamics( def kinetic_model( - subset_adata: AnnData, - tkey: str, - model: Literal["auto", "deterministic", "stochastic"], - est_method: Literal["twostep", "direct", "storm-csp", "storm-cszip", "storm-icsp"], - experiment_type: str, - has_splicing: bool, - splicing_labeling: bool, - has_switch: bool, - param_rngs: Dict[str, List[int]], - data_type: Literal["smoothed", "sfs"] = "sfs", - return_ntr: bool = False, - **est_kwargs, + subset_adata: AnnData, + tkey: str, + model: Literal["auto", "deterministic", "stochastic"], + est_method: Literal["twostep", "direct", "storm-csp", "storm-cszip", "storm-icsp"], + experiment_type: str, + has_splicing: bool, + splicing_labeling: bool, + has_switch: bool, + param_rngs: Dict[str, List[int]], + data_type: Literal["smoothed", "sfs"] = "sfs", + return_ntr: bool = False, + **est_kwargs, ) -> Tuple[ Union[Dict[str, Any], pd.DataFrame], np.ndarray, From 6551cde23afaa545871efb52b500bee61047fe90 Mon Sep 17 00:00:00 2001 From: sichao Date: Fri, 7 Jul 2023 11:24:20 -0400 Subject: [PATCH 24/31] rename storm func --- dynamo/tools/dynamics.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/dynamo/tools/dynamics.py b/dynamo/tools/dynamics.py index ed7d84200..2d3d39ef5 100755 --- a/dynamo/tools/dynamics.py +++ b/dynamo/tools/dynamics.py @@ -1065,7 +1065,7 @@ class KineticsStormDynamics(LabeledDynamics): transcription, splicing, and spliced mRNA degradation. 
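A minimal usage sketch for the `dynamics` wrapper declared above (a sketch only: the `dyn.tl` accessor, the preprocessing step, and the `tkey="time"` column name are assumptions for illustration, not confirmed by this patch):

    import dynamo as dyn

    # adata: an AnnData object that has already gone through dynamo's preprocessing,
    # so the layers and moments expected by the Dynamics classes are available.
    adata = dyn.tl.dynamics(
        adata,
        model="auto",       # choose deterministic vs. stochastic automatically
        est_method="auto",  # parameter-estimation method, see the signature above
        tkey="time",        # assumed labeling-time column in adata.obs (labeling data only)
        cores=4,            # thread-pool size used for per-gene fitting
    )
    # The fitted kinetic parameters and velocities are written back into the returned
    # AnnData (see set_velocity above).
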
And in Model 3, we considered the switching of gene expression states, transcription in the active state, and mRNA degradation.""" - def _calculate_vel_U( + def calculate_vel_U( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -1075,7 +1075,7 @@ def _calculate_vel_U( ) -> Union[ndarray, csr_matrix]: return vel.vel_u(U) - def _calculate_vel_S( + def calculate_vel_S( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -1085,7 +1085,7 @@ def _calculate_vel_S( ) -> Union[ndarray, csr_matrix]: return vel.vel_s(U, S) - def _calculate_vel_N( + def calculate_vel_N( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -1098,7 +1098,7 @@ def _calculate_vel_N( else: return vel.vel_u(N) - def _calculate_vel_T( + def calculate_vel_T( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -1111,7 +1111,7 @@ def _calculate_vel_T( else: return vel.vel_u(T) - def _calculate_velocity( + def calculate_velocity( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -1121,13 +1121,13 @@ def _calculate_velocity( ) -> Tuple: """Override the velocity calculation function to reset beta or alpha.""" if self.has_splicing: - vel_U = self._calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) - vel_S = self._calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) + vel_U = self.calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) + vel_S = self.calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) vel.parameters["beta"] = self.gamma else: vel_U, vel_S = np.nan, np.nan - vel_N = self._calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) - vel_T = self._calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) + vel_N = self.calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) + vel_T = self.calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) return vel_U, vel_S, vel_N, vel_T From ef9682a63e0533eb7521f8739c3da7a0c3750d52 Mon Sep 17 00:00:00 2001 From: sichao Date: Fri, 7 Jul 2023 12:02:45 -0400 Subject: [PATCH 25/31] reformat storm --- dynamo/estimation/tsc/storm.py | 91 +++++++++++++++++----------------- 1 file changed, 46 insertions(+), 45 deletions(-) diff --git a/dynamo/estimation/tsc/storm.py b/dynamo/estimation/tsc/storm.py index 8c038d8af..572c24737 100644 --- a/dynamo/estimation/tsc/storm.py +++ b/dynamo/estimation/tsc/storm.py @@ -17,15 +17,15 @@ def mle_cell_specific_poisson_ss( - R: Union[np.ndarray, csr_matrix], - N: Union[np.ndarray, csr_matrix], - time: np.ndarray, - gamma_init: np.ndarray, - cell_total: np.ndarray, - Total_smoothed, - New_smoothed, + R: Union[np.ndarray, csr_matrix], + N: Union[np.ndarray, csr_matrix], + time: np.ndarray, + gamma_init: np.ndarray, + cell_total: np.ndarray, + Total_smoothed, + New_smoothed, ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: - """"Infer parameters based on the cell specific Poisson model using maximum likelihood estimation under the + """Infer parameters based on the cell specific Poisson model using maximum likelihood estimation under the steady-state assumption Args: @@ -145,12 +145,12 @@ def saturated_loss_func_ss(): def mle_cell_specific_poisson( - N: Union[np.ndarray, csr_matrix], - time: np.ndarray, - gamma_init: np.ndarray, - cell_total: np.ndarray + N: Union[np.ndarray, csr_matrix], + time: np.ndarray, + gamma_init: np.ndarray, + cell_total: np.ndarray, ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: - """"Infer parameters based on cell specific Poisson distributions using maximum likelihood estimation + """Infer parameters based on cell specific Poisson distributions using maximum likelihood estimation Args: N: The number of new mRNA counts for each gene in each cell. 
shape: (n_var, n_obs). @@ -234,14 +234,14 @@ def saturated_loss_func(): def mle_cell_specific_zero_inflated_poisson( - N: Union[np.ndarray, csr_matrix], - time: np.ndarray, - gamma_init: np.ndarray, - cell_total: np.ndarray + N: Union[np.ndarray, csr_matrix], + time: np.ndarray, + gamma_init: np.ndarray, + cell_total: np.ndarray, ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: - """"Infer parameters based on cell specific zero-inflated Poisson distributions using maximum likelihood estimation + """Infer parameters based on cell specific zero-inflated Poisson distributions using maximum likelihood estimation - Args: + Args: N: The number of new mRNA counts for each gene in each cell. shape: (n_var, n_obs). time: The time point of each cell. shape: (n_obs,). gamma_init: The initial value of gamma. shape: (n_var,). @@ -354,16 +354,16 @@ def saturated_loss_func(): def mle_independent_cell_specific_poisson( - UL: Union[np.ndarray, csr_matrix], - SL: Union[np.ndarray, csr_matrix], - time: np.ndarray, - gamma_init: np.ndarray, - beta_init: np.ndarray, - cell_total: np.ndarray, - Total_smoothed: Union[np.ndarray, csr_matrix], - S_smoothed: Union[np.ndarray, csr_matrix] + UL: Union[np.ndarray, csr_matrix], + SL: Union[np.ndarray, csr_matrix], + time: np.ndarray, + gamma_init: np.ndarray, + beta_init: np.ndarray, + cell_total: np.ndarray, + Total_smoothed: Union[np.ndarray, csr_matrix], + S_smoothed: Union[np.ndarray, csr_matrix], ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: - """"Infer parameters based on independent cell specific Poisson distributions using maximum likelihood estimation + """Infer parameters based on independent cell specific Poisson distributions using maximum likelihood estimation Args: UL: The number of unspliced labeled mRNA counts for each gene in each cell. shape: (n_var, n_obs). @@ -480,13 +480,13 @@ def saturated_loss_func(): def cell_specific_alpha_beta( - UL_smoothed_CSP: Union[np.ndarray, csr_matrix], - SL_smoothed_CSP: Union[np.ndarray, csr_matrix], - time: np.ndarray, - gamma_init: np.ndarray, - beta_init: np.ndarray + UL_smoothed_CSP: Union[np.ndarray, csr_matrix], + SL_smoothed_CSP: Union[np.ndarray, csr_matrix], + time: np.ndarray, + gamma_init: np.ndarray, + beta_init: np.ndarray, ) -> Tuple[csr_matrix, csr_matrix]: - """"Infer cell specific transcription rate and splicing rate based on ICSP model + """Infer cell specific transcription rate and splicing rate based on ICSP model Args: UL_smoothed_CSP: The number of unspliced labeled mRNA expression after smoothing based on CSP type model for @@ -537,12 +537,13 @@ def solve_beta_func(beta_j): def visualize_CSP_loss_landscape( - adata: AnnData, - gene_name_list: list, - figsize: tuple = (3, 3), - dpi: int = 75, - save_name: Optional[str] = None): - """"Draw the landscape of CSP model-based loss function for the given genes. + adata: AnnData, + gene_name_list: list, + figsize: tuple = (3, 3), + dpi: int = 75, + save_name: Optional[str] = None, +): + """Draw the landscape of CSP model-based loss function for the given genes. 
Args: adata: class:`~anndata.AnnData` @@ -674,8 +675,8 @@ def _plot_landscape(X, Y, Z, gamma, alpha_div_gamma_dldalpha_eq0, alpha_div_gamm def robustness_measure_CSP( - adata: AnnData, - gene_name_list: list, + adata: AnnData, + gene_name_list: list, ) -> np.ndarray: """Calculate the robustness measure based on CSP model inference of the given genes @@ -697,9 +698,9 @@ def robustness_measure_CSP( def calculate_robustness_measure_CSP( - N: Union[np.ndarray, csr_matrix], - time: np.ndarray, - cell_total: np.ndarray + N: Union[np.ndarray, csr_matrix], + time: np.ndarray, + cell_total: np.ndarray, ) -> np.ndarray: """Calculate the robustness measure based on CSP model inference From ea31eecc0e893d9cf87ab5d20d2b2807e67c804c Mon Sep 17 00:00:00 2001 From: sichao Date: Tue, 25 Jul 2023 16:12:50 -0400 Subject: [PATCH 26/31] create kinetic params est func --- dynamo/tools/dynamics.py | 1004 +++++++++++++++++++++++++++++++++++++- 1 file changed, 1003 insertions(+), 1 deletion(-) diff --git a/dynamo/tools/dynamics.py b/dynamo/tools/dynamics.py index 2d3d39ef5..5dbf54737 100755 --- a/dynamo/tools/dynamics.py +++ b/dynamo/tools/dynamics.py @@ -304,6 +304,9 @@ def __init__(self, dynamics_kwargs: Dict): self.tkey = self.adata.uns["pp"]["tkey"] if dynamics_kwargs["tkey"] is None else dynamics_kwargs["tkey"] self.est_kwargs = dynamics_kwargs["est_kwargs"] + def estimate_params_utils(self, params_est_kwargs): + pass + def estimate_params_ss(self, subset_adata: AnnData, **est_params_args): """Estimate velocity parameters with steady state mRNA assumption.""" if self.est_method.lower() == "auto": @@ -1059,11 +1062,279 @@ def calculate_vels( return vel_U, vel_S, vel_N, vel_T +class TwoStepKineticsDynamics(KineticsDynamics): + def estimate_params_utils(self, params_est_kwargs): + ( + subset_adata, + data_type, + return_ntr, + ) = params_est_kwargs["subset_adata"], params_est_kwargs["data_type"], params_est_kwargs["return_ntr"] + time = subset_adata.obs[self.tkey].astype("float").values + if self.has_splicing: + layers = ( + ["M_u", "M_s", "M_t", "M_n"] + if ("M_u" in subset_adata.layers.keys() and data_type == "smoothed") + else ["X_u", "X_s", "X_t", "X_n"] + ) + U, S, Total, New = ( + subset_adata.layers[layers[0]].T, + subset_adata.layers[layers[1]].T, + subset_adata.layers[layers[2]].T, + subset_adata.layers[layers[3]].T, + ) + US, S2 = ( + subset_adata.layers["M_us"].T, + subset_adata.layers["M_ss"].T, + ) + # gamma, gamma_r2 = lin_reg_gamma_synthesis(U, Ul, time, perc_right=100) + ( + gamma_k, + gamma_b, + gamma_all_r2, + gamma_all_logLL, + ) = fit_slope_stochastic(S, U, US, S2, perc_left=None, perc_right=100) + ( + gamma, + gamma_r2, + X_data, + mean_R2, + K_fit, + ) = lin_reg_gamma_synthesis(Total, New, time, perc_right=100) + + k = 1 - np.exp(-gamma[:, None] * time[None, :]) + beta = gamma / gamma_k # gamma_k = gamma / beta + + Estm_df = { + "alpha": csr_matrix(gamma[:, None]).multiply(New).multiply(1 / k), + "beta": beta, + "gamma_k": gamma_k, + "gamma_b": gamma_b, + "gamma_k_r2": gamma_all_r2, + "gamma_logLL": gamma_all_logLL, + "gamma": gamma, + "gamma_r2": gamma_r2, + "mean_R2": mean_R2, + } + half_life = np.log(2) / gamma + cost, logLL, _param_ranges, X_data, X_fit_data = ( + None, + None, + None, + X_data, + K_fit, + ) + + return ( + Estm_df, + half_life, + cost, + logLL, + _param_ranges, + X_data, + X_fit_data, + ) + else: + layers = ( + ["M_t", "M_n"] + if ("M_t" in subset_adata.layers.keys() and data_type == "smoothed") + else ["X_t", "X_n"] + ) + Total, New = ( + 
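# Two-step logic of this label-only branch: gamma is first fit by regressing new RNA
# on total RNA across labeling times (lin_reg_gamma_synthesis below); a cell-wise alpha
# then follows from the kinetic relation new = (alpha / gamma) * (1 - exp(-gamma * t)),
# i.e. alpha = gamma * new / k with k = 1 - exp(-gamma * t), as computed a few lines down.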
subset_adata.layers[layers[0]].T, + subset_adata.layers[layers[1]].T, + ) + ( + gamma, + gamma_r2, + X_data, + mean_R2, + K_fit, + ) = lin_reg_gamma_synthesis(Total, New, time, perc_right=100) + + k = 1 - np.exp(-gamma[:, None] * time[None, :]) + Estm_df = { + "alpha": csr_matrix(gamma[:, None]).multiply(New).multiply(1 / k), + "gamma": gamma, + "gamma_k": gamma, # required for phase_potrait + "gamma_r2": gamma_r2, + "mean_R2": mean_R2, + } + half_life = np.log(2) / gamma + cost, logLL, _param_ranges, X_data, X_fit_data = ( + None, + None, + None, + X_data, + K_fit, + ) + + return ( + Estm_df, + half_life, + cost, + logLL, + _param_ranges, + X_data, + X_fit_data, + ) + class KineticsStormDynamics(LabeledDynamics): """Stochastic transient dynamics for the kinetic experiment with kinetic assumption. This includes three stochastic models. In Model 1, only transcription and mRNA degradation were considered. In Model 2, we considered transcription, splicing, and spliced mRNA degradation. And in Model 3, we considered the switching of gene expression states, transcription in the active state, and mRNA degradation.""" + def estimate_params_utils(self, params_est_kwargs): + ( + subset_adata, + data_type, + return_ntr, + ) = params_est_kwargs["subset_adata"], params_est_kwargs["data_type"], params_est_kwargs["return_ntr"] + time = subset_adata.obs[self.tkey].astype("float").values + if self.has_splicing: + # Initialization based on the steady-state assumption + layers_smoothed = ["M_u", "M_s", "M_t", "M_n"] + U_smoothed, S_smoothed, Total_smoothed, New_smoothed = ( + subset_adata.layers[layers_smoothed[0]].T, + subset_adata.layers[layers_smoothed[1]].T, + subset_adata.layers[layers_smoothed[2]].T, + subset_adata.layers[layers_smoothed[3]].T, + ) + + US_smoothed, S2_smoothed = ( + subset_adata.layers["M_us"].T, + subset_adata.layers["M_ss"].T, + ) + (gamma_k, _, _, _,) = fit_slope_stochastic(S_smoothed, U_smoothed, US_smoothed, S2_smoothed, + perc_left=None, perc_right=5) + (gamma_init, _, _, _, _) = lin_reg_gamma_synthesis(Total_smoothed, New_smoothed, time, perc_right=5) + beta_init = gamma_init / gamma_k # gamma_k = gamma / beta + + # Read raw counts + layers_raw = ["ul", "sl"] + UL_raw, SL_raw = ( + subset_adata.layers[layers_raw[0]].T, + subset_adata.layers[layers_raw[1]].T, + ) + + # Read smoothed values based CSP type distribution for cell-specific parameter inference + UL_smoothed_CSP, SL_smoothed_CSP = ( + subset_adata.layers['M_CSP_ul'].T, + subset_adata.layers['M_CSP_sl'].T, + ) + + # Parameters inference based on maximum likelihood estimation + cell_total = subset_adata.obs['initial_cell_size'].astype("float").values + # Independent cell-specific Poisson + (gamma_s, gamma_r2, beta, gamma_t, gamma_r2_raw, alpha) = storm.mle_independent_cell_specific_poisson \ + (UL_raw, SL_raw, time, gamma_init, beta_init, cell_total, Total_smoothed, S_smoothed) + gamma_k = gamma_s / beta + gamma_b = np.zeros_like(gamma_k) + + # Cell specific parameters (fixed gamma_s) + alpha, beta = storm.cell_specific_alpha_beta(UL_smoothed_CSP, SL_smoothed_CSP, time, gamma_s, beta) + + # # Cell specific parameters(fixed gamma_t) + # k = 1 - np.exp(-gamma_t[:, None] * time[None, :]) + # alpha = csr_matrix(gamma_t[:, None]).multiply(UL_smoothed_CSP+SL_smoothed_CSP).multiply(1 / k) + + Estm_df = { + "alpha": alpha, + "beta": beta, + "gamma_k": gamma_k, + "gamma_b": gamma_b, + # "gamma_k_r2": gamma_all_r2, + # "gamma_logLL": gamma_all_logLL, + "gamma": gamma_s, + "gamma_r2": gamma_r2, + # "mean_R2": mean_R2, + "gamma_t": 
gamma_t, + "gamma_r2_raw": gamma_r2_raw, + } + half_life = np.log(2) / gamma_s + cost, logLL, _param_ranges, X_data, X_fit_data = ( + None, + None, + None, + None, + None, + ) + + return ( + Estm_df, + half_life, + cost, + logLL, + _param_ranges, + X_data, + X_fit_data, + ) + else: + # Initialization based on the steady-state assumption + layers_smoothed = ["M_t", "M_n"] + Total_smoothed, New_smoothed = ( + subset_adata.layers[layers_smoothed[0]].T, + subset_adata.layers[layers_smoothed[1]].T, + ) + (gamma_init, _, _, _, _,) = lin_reg_gamma_synthesis(Total_smoothed, New_smoothed, time, + perc_right=5) + + # Read raw counts + layers_raw = ["total", "new"] + Total_raw, New_raw = ( + subset_adata.layers[layers_raw[0]].T, + subset_adata.layers[layers_raw[1]].T, + ) + + # Read smoothed values based CSP type distribution for cell-specific parameter inference + layers_smoothed_CSP = ["M_CSP_t", "M_CSP_n"] + Total_smoothed_CSP, New_smoothed_CSP = ( + subset_adata.layers[layers_smoothed_CSP[0]].T, + subset_adata.layers[layers_smoothed_CSP[1]].T, + ) + + # Parameters inference based on maximum likelihood estimation + cell_total = subset_adata.obs['initial_cell_size'].astype("float").values + + if "storm-csp" == self.est_method: + gamma, gamma_r2, gamma_r2_raw, alpha = storm.mle_cell_specific_poisson(New_raw, time, + gamma_init, cell_total) + elif "storm-cszip" == self.est_method: + gamma, prob_off, gamma_r2, gamma_r2_raw, alpha = storm.mle_cell_specific_zero_inflated_poisson( + New_raw, time, gamma_init, cell_total) + alpha = alpha * (1 - prob_off) # gene-wise alpha + else: + raise NotImplementedError("This method has not been implemented.") + + k = 1 - np.exp(-gamma[:, None] * time[None, :]) + alpha = csr_matrix(gamma[:, None]).multiply(New_smoothed_CSP).multiply(1 / k) # gene-cell-wise alpha + + Estm_df = { + "alpha": alpha, + "gamma": gamma, + "gamma_k": gamma, # required for phase_potrait + "gamma_r2": gamma_r2, + "gamma_r2_raw": gamma_r2_raw, + # "mean_R2": mean_R2, + "prob_off": prob_off if "cszip" in self.est_method else None + } + half_life = np.log(2) / gamma + cost, logLL, _param_ranges, X_data, X_fit_data = ( + None, + None, + None, + None, # X_data, + None, # K_fit, + ) + + return ( + Estm_df, + half_life, + cost, + logLL, + _param_ranges, + X_data, + X_fit_data, + ) def calculate_vel_U( self, @@ -1111,7 +1382,7 @@ def calculate_vel_T( else: return vel.vel_u(T) - def calculate_velocity( + def calculate_vels( self, vel: Velocity, U: Union[ndarray, csr_matrix], @@ -1131,10 +1402,594 @@ def calculate_velocity( return vel_U, vel_S, vel_N, vel_T +class DirectKineticsDynamics(KineticsDynamics): + def estimate_params_utils(self, params_est_kwargs): + ( + subset_adata, + data_type, + return_ntr, + ) = params_est_kwargs["subset_adata"], params_est_kwargs["data_type"], params_est_kwargs["return_ntr"] + has_switch = True + param_rngs = {} + time = subset_adata.obs[self.tkey].astype("float").values + if self.has_splicing and self.splicing_labeling: + layers = ( + ["M_ul", "M_sl", "M_uu", "M_su"] + if ("M_ul" in subset_adata.layers.keys() and data_type == "smoothed") + else ["X_ul", "X_sl", "X_uu", "X_su"] + ) + + if self.model.lower() in ["deterministic", "stochastic"]: + layer_u = "M_ul" if ("M_ul" in subset_adata.layers.keys() and data_type == "smoothed") else "X_ul" + layer_s = "M_sl" if ("M_ul" in subset_adata.layers.keys() and data_type == "smoothed") else "X_sl" + + X, X_raw = prepare_data_has_splicing( + subset_adata, + subset_adata.var.index, + time, + layer_u=layer_u, + layer_s=layer_s, + 
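# Flow of the direct kinetic fit in this method: per-gene time series are assembled from
# the labeled unspliced/spliced layers here, the parameter ranges and model class
# (deterministic, moment/stochastic, or mixture) are selected below, and each gene is then
# fit by least squares (estm.fit_lsq / estm.auto_fit), with goodness of fit evaluated via
# GoodnessOfFit at the end of the loop.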
total_layers=layers, + ) + elif self.model.startswith("mixture"): + X, _, X_raw = prepare_data_deterministic( + subset_adata, + subset_adata.var.index, + time, + layers=layers, + total_layers=layers, + ) + + if self.model.lower() == "deterministic": + X = [X[i][[0, 1], :] for i in range(len(X))] + _param_ranges = { + "alpha": [0, 1000], + "beta": [0, 1000], + "gamma": [0, 1000], + } + x0 = {"u0": [0, 1000], "s0": [0, 1000]} + Est, _ = Estimation_DeterministicKin, Deterministic + elif self.model.lower() == "stochastic": + x0 = { + "u0": [0, 1000], + "s0": [0, 1000], + "uu0": [0, 1000], + "ss0": [0, 1000], + "us0": [0, 1000], + } + + if has_switch: + _param_ranges = { + "a": [0, 1000], + "b": [0, 1000], + "alpha_a": [0, 1000], + "alpha_i": 0, + "beta": [0, 1000], + "gamma": [0, 1000], + } + Est, _ = Estimation_MomentKin, Moments + else: + _param_ranges = { + "alpha": [0, 1000], + "beta": [0, 1000], + "gamma": [0, 1000], + } + + Est, _ = ( + Estimation_MomentKinNoSwitch, + Moments_NoSwitching, + ) + elif self.model.lower() == "mixture": + _param_ranges = { + "alpha": [0, 1000], + "alpha_2": [0, 0], + "beta": [0, 1000], + "gamma": [0, 1000], + } + x0 = { + "ul0": [0, 0], + "sl0": [0, 0], + "uu0": [0, 1000], + "su0": [0, 1000], + } + + Est = Mixture_KinDeg_NoSwitching(Deterministic(), Deterministic()) + elif self.model.lower() == "mixture_deterministic_stochastic": + X, X_raw = prepare_data_mix_has_splicing( + subset_adata, + subset_adata.var.index, + time, + layer_u=layers[2], + layer_s=layers[3], + layer_ul=layers[0], + layer_sl=layers[1], + total_layers=layers, + mix_model_indices=[0, 1, 5, 6, 7, 8, 9], + ) + + _param_ranges = { + "alpha": [0, 1000], + "alpha_2": [0, 0], + "beta": [0, 1000], + "gamma": [0, 1000], + } + x0 = { + "ul0": [0, 0], + "sl0": [0, 0], + "u0": [0, 1000], + "s0": [0, 1000], + "uu0": [0, 1000], + "ss0": [0, 1000], + "us0": [0, 1000], + } + Est = Mixture_KinDeg_NoSwitching(Deterministic(), Moments_NoSwitching()) + elif self.model.lower() == "mixture_stochastic_stochastic": + _param_ranges = { + "alpha": [0, 1000], + "alpha_2": [0, 0], + "beta": [0, 1000], + "gamma": [0, 1000], + } + X, X_raw = prepare_data_mix_has_splicing( + subset_adata, + subset_adata.var.index, + time, + layer_u=layers[2], + layer_s=layers[3], + layer_ul=layers[0], + layer_sl=layers[1], + total_layers=layers, + mix_model_indices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + ) + x0 = { + "ul0": [0, 1000], + "sl0": [0, 1000], + "ul_ul0": [0, 1000], + "sl_sl0": [0, 1000], + "ul_sl0": [0, 1000], + "u0": [0, 1000], + "s0": [0, 1000], + "uu0": [0, 1000], + "ss0": [0, 1000], + "us0": [0, 1000], + } + Est = Mixture_KinDeg_NoSwitching(Moments_NoSwitching(), Moments_NoSwitching()) + else: + raise NotImplementedError( + f"model {self.model} with kinetic assumption is not implemented. 
" + f"current supported models for kinetics experiments include: stochastic, deterministic, mixture," + f"mixture_deterministic_stochastic or mixture_stochastic_stochastic" + ) + else: + total_layer = "M_t" if ("M_t" in subset_adata.layers.keys() and data_type == "smoothed") else "X_total" + + if self.model.lower() in ["deterministic", "stochastic"]: + layer = "M_n" if ("M_n" in subset_adata.layers.keys() and data_type == "smoothed") else "X_new" + X, X_raw = prepare_data_no_splicing( + subset_adata, + subset_adata.var.index, + time, + layer=layer, + total_layer=total_layer, + ) + elif self.model.lower().startswith("mixture"): + layers = ( + ["M_n", "M_t"] + if ("M_n" in subset_adata.layers.keys() and data_type == "smoothed") + else ["X_new", "X_total"] + ) + + X, _, X_raw = prepare_data_deterministic( + subset_adata, + subset_adata.var.index, + time, + layers=layers, + total_layers=total_layer, + ) + + if self.model.lower() == "deterministic": + X = [X[i][0, :] for i in range(len(X))] + _param_ranges = { + "alpha": [0, 1000], + "gamma": [0, 1000], + } + x0 = {"u0": [0, 1000]} + Est, _ = ( + Estimation_DeterministicKinNosp, + Deterministic_NoSplicing, + ) + elif self.model.lower() == "stochastic": + x0 = { + "u0": [0, 1000], + "uu0": [0, 1000], + } + if has_switch: + _param_ranges = { + "a": [0, 1000], + "b": [0, 1000], + "alpha_a": [0, 1000], + "alpha_i": 0, + "gamma": [0, 1000], + } + Est, _ = Estimation_MomentKinNosp, Moments_Nosplicing + else: + _param_ranges = { + "alpha": [0, 1000], + "gamma": [0, 1000], + } + Est, _ = ( + Estimation_MomentKinNoSwitchNoSplicing, + Moments_NoSwitchingNoSplicing, + ) + elif self.model.lower() == "mixture": + _param_ranges = { + "alpha": [0, 1000], + "alpha_2": [0, 0], + "gamma": [0, 1000], + } + x0 = {"u0": [0, 0], "o0": [0, 1000]} + Est = Mixture_KinDeg_NoSwitching(Deterministic_NoSplicing(), Deterministic_NoSplicing()) + elif self.model.lower() == "mixture_deterministic_stochastic": + X, X_raw = prepare_data_mix_no_splicing( + subset_adata, + subset_adata.var.index, + time, + layer_n=layers[0], + layer_t=layers[1], + total_layer=total_layer, + mix_model_indices=[0, 2, 3], + ) + + _param_ranges = { + "alpha": [0, 1000], + "alpha_2": [0, 0], + "gamma": [0, 1000], + } + x0 = {"u0": [0, 1000], "o0": [0, 1000], "oo0": [0, 1000]} + Est = Mixture_KinDeg_NoSwitching( + Deterministic_NoSplicing(), + Moments_NoSwitchingNoSplicing(), + ) + elif self.model.lower() == "mixture_stochastic_stochastic": + X, X_raw = prepare_data_mix_no_splicing( + subset_adata, + subset_adata.var.index, + time, + layer_n=layers[0], + layer_t=layers[1], + total_layer=total_layer, + mix_model_indices=[0, 1, 2, 3], + ) + + _param_ranges = { + "alpha": [0, 1000], + "alpha_2": [0, 0], + "gamma": [0, 1000], + } + x0 = { + "u0": [0, 1000], + "uu0": [0, 1000], + "o0": [0, 1000], + "oo0": [0, 1000], + } + Est = Mixture_KinDeg_NoSwitching( + Moments_NoSwitchingNoSplicing(), + Moments_NoSwitchingNoSplicing(), + ) + else: + raise NotImplementedError( + f"model {self.model} with kinetic assumption is not implemented. 
" + f"current supported models for kinetics experiments include: stochastic, deterministic, " + f"mixture, mixture_deterministic_stochastic or mixture_stochastic_stochastic" + ) + _param_ranges = update_dict(_param_ranges, param_rngs) + x0_ = np.vstack([ran for ran in x0.values()]).T + + n_genes = subset_adata.n_vars + cost, logLL = np.zeros(n_genes), np.zeros(n_genes) + all_keys = list(_param_ranges.keys()) + list(x0.keys()) + all_keys = [cur_key for cur_key in all_keys if cur_key != "alpha_i"] + half_life, Estm = np.zeros(n_genes), [None] * n_genes + X_data, X_fit_data = [None] * n_genes, [None] * n_genes + if self.experiment_type: + popt = [None] * n_genes + + main_debug("model: %s, experiment_type: %s" % (self.model, self.experiment_type)) + for i_gene in tqdm(range(n_genes), desc="estimating kinetic-parameters using kinetic model"): + if self.model.lower().startswith("mixture"): + estm = Est + if self.model.lower() == "mixture": + cur_X_data = np.vstack([X[i_layer][i_gene] for i_layer in range(len(X))]) + if issparse(X_raw[0]): + cur_X_raw = np.hstack([X_raw[i_layer][:, i_gene].A for i_layer in range(len(X))]) + else: + cur_X_raw = np.hstack([X_raw[i_layer][:, i_gene] for i_layer in range(len(X))]) + else: + cur_X_data = X[i_gene] + cur_X_raw = X_raw[i_gene] + + if issparse(cur_X_raw[0, 0]): + cur_X_raw = np.hstack((cur_X_raw[0, 0].A, cur_X_raw[1, 0].A)) + + _, cost[i_gene] = estm.auto_fit(np.unique(time), cur_X_data) + ( + model_1, + model_2, + kinetic_parameters, + mix_x0, + ) = estm.export_dictionary().values() + tmp = list(kinetic_parameters.values()) + tmp.extend(mix_x0) + Estm[i_gene] = tmp + else: + cur_X_data, cur_X_raw = X[i_gene], X_raw[i_gene] + + if self.has_splicing: + alpha0 = guestimate_alpha(np.sum(cur_X_data, 0), np.unique(time)) + else: + alpha0 = ( + guestimate_alpha(cur_X_data, np.unique(time)) + if cur_X_data.ndim == 1 + else guestimate_alpha(cur_X_data[0], np.unique(time)) + ) + + if self.model.lower() == "stochastic": + _param_ranges.update({"alpha_a": [0, alpha0 * 10]}) + elif self.model.lower() == "deterministic": + _param_ranges.update({"alpha": [0, alpha0 * 10]}) + param_ranges = [ran for ran in _param_ranges.values()] + + estm = Est(*param_ranges, x0=x0_) if "x0" in inspect.getfullargspec(Est) else Est(*param_ranges) + _, cost[i_gene] = estm.fit_lsq(np.unique(time), cur_X_data, **self.est_kwargs) + if self.model.lower() == "deterministic": + Estm[i_gene] = estm.export_parameters() + else: + tmp = np.ma.array(estm.export_parameters(), mask=False) + tmp.mask[3] = True + Estm[i_gene] = tmp.compressed() + + if issparse(cur_X_raw[0, 0]): + cur_X_raw = np.hstack((cur_X_raw[0, 0].A, cur_X_raw[1, 0].A)) + + X_data[i_gene] = cur_X_data + if self.model.lower().startswith("mixture"): + X_fit_data[i_gene] = estm.simulator.x.T + X_fit_data[i_gene][estm.model1.n_species:] *= estm.scale + else: + if hasattr(estm, "extract_data_from_simulator"): + X_fit_data[i_gene] = estm.extract_data_from_simulator() + else: + X_fit_data[i_gene] = estm.simulator.x.T + + half_life[i_gene] = np.log(2) / Estm[i_gene][-1] + + if self.model.lower().startswith("mixture"): + species = [0, 1, 2, 3] if self.has_splicing else [0, 1] + gof = GoodnessOfFit(estm.export_model(), params=estm.export_parameters()) + gof.prepare_data(time, cur_X_raw.T, species=species, normalize=True) + else: + gof = GoodnessOfFit( + estm.export_model(), + params=estm.export_parameters(), + x0=estm.simulator.x0, + ) + gof.prepare_data(time, cur_X_raw.T, normalize=True) + + logLL[i_gene] = gof.calc_mean_squared_deviation() 
# .calc_gaussian_loglikelihood() + + Estm_df = pd.DataFrame(np.vstack(Estm), columns=[*all_keys[: len(Estm[0])]]) + + return Estm_df, half_life, cost, logLL, _param_ranges, X_data, X_fit_data + + class DegradationDynamics(LabeledDynamics): """Dynamics model for the degradation experiment. In degradation experiment, samples are chased after an extended 4sU (or other nucleotide analog) labeling period and the wash-out to observe the decay of the abundance of the (labeled) unspliced and spliced RNA decay over time.""" + def estimate_params_utils(self, params_est_kwargs): + ( + subset_adata, + data_type, + return_ntr, + ) = params_est_kwargs["subset_adata"], params_est_kwargs["data_type"], params_est_kwargs["return_ntr"] + has_switch = True + param_rngs = {} + time = subset_adata.obs[self.tkey].astype("float").values + if self.has_splicing and self.splicing_labeling: + layers = ( + ["M_ul", "M_sl", "M_uu", "M_su"] + if ("M_ul" in subset_adata.layers.keys() and data_type == "smoothed") + else ["X_ul", "X_sl", "X_uu", "X_su"] + ) + + if self.model.lower() in ["deterministic", "stochastic"]: + layer_u = "M_ul" if ("M_ul" in subset_adata.layers.keys() and data_type == "smoothed") else "X_ul" + layer_s = "M_sl" if ("M_sl" in subset_adata.layers.keys() and data_type == "smoothed") else "X_sl" + + X, X_raw = prepare_data_has_splicing( + subset_adata, + subset_adata.var.index, + time, + layer_u=layer_u, + layer_s=layer_s, + total_layers=layers, + return_ntr=return_ntr, + ) + elif self.model.lower().startswith("mixture"): + X, _, X_raw = prepare_data_deterministic( + subset_adata, + subset_adata.var.index, + time, + layers=layers, + total_layers=layers, + return_ntr=return_ntr, + ) + + if self.model.lower() == "deterministic": + X = [X[i][[0, 1], :] for i in range(len(X))] + _param_ranges = { + "beta": [0, 1000], + "gamma": [0, 1000], + } + x0 = { + "u0": [0, 1000], + "s0": [0, 1000], + } + Est, _ = Estimation_DeterministicDeg, Deterministic + elif self.model.lower() == "stochastic": + _param_ranges = { + "beta": [0, 1000], + "gamma": [0, 1000], + } + x0 = { + "u0": [0, 1000], + "s0": [0, 1000], + "uu0": [0, 1000], + "ss0": [0, 1000], + "us0": [0, 1000], + } + Est, _ = Estimation_MomentDeg, Moments_NoSwitching + else: + raise NotImplementedError( + f"model {self.model} with kinetic assumption is not implemented. " + f"current supported models for degradation experiment include: " + f"stochastic, deterministic." + ) + else: + total_layer = "M_t" if ("M_t" in subset_adata.layers.keys() and data_type == "smoothed") else "X_total" + + layer = "M_n" if ("M_n" in subset_adata.layers.keys() and data_type == "smoothed") else "X_new" + X, X_raw = prepare_data_no_splicing( + subset_adata, + subset_adata.var.index, + time, + layer=layer, + total_layer=total_layer, + return_ntr=return_ntr, + ) + + if self.model.lower() == "deterministic": + X = [X[i][0, :] for i in range(len(X))] + _param_ranges = { + "gamma": [0, 10], + } + x0 = {"u0": [0, 1000]} + Est, _ = ( + Estimation_DeterministicDegNosp, + Deterministic_NoSplicing, + ) + elif self.model.lower() == "stochastic": + _param_ranges = { + "gamma": [0, 10], + } + x0 = {"u0": [0, 1000], "uu0": [0, 1000]} + Est, _ = Estimation_MomentDegNosp, Moments_NoSwitchingNoSplicing + else: + raise NotImplementedError( + f"model {self.model} with kinetic assumption is not implemented. 
" + f"current supported models for degradation experiment include: " + f"stochastic, deterministic.") + _param_ranges = update_dict(_param_ranges, param_rngs) + x0_ = np.vstack([ran for ran in x0.values()]).T + + n_genes = subset_adata.n_vars + cost, logLL = np.zeros(n_genes), np.zeros(n_genes) + all_keys = list(_param_ranges.keys()) + list(x0.keys()) + all_keys = [cur_key for cur_key in all_keys if cur_key != "alpha_i"] + half_life, Estm = np.zeros(n_genes), [None] * n_genes + X_data, X_fit_data = [None] * n_genes, [None] * n_genes + if self.experiment_type: + popt = [None] * n_genes + + main_debug("model: %s, experiment_type: %s" % (self.model, self.experiment_type)) + for i_gene in tqdm(range(n_genes), desc="estimating kinetic-parameters using kinetic model"): + if self.model.lower().startswith("mixture"): + estm = Est + if self.model.lower() == "mixture": + cur_X_data = np.vstack([X[i_layer][i_gene] for i_layer in range(len(X))]) + if issparse(X_raw[0]): + cur_X_raw = np.hstack([X_raw[i_layer][:, i_gene].A for i_layer in range(len(X))]) + else: + cur_X_raw = np.hstack([X_raw[i_layer][:, i_gene] for i_layer in range(len(X))]) + else: + cur_X_data = X[i_gene] + cur_X_raw = X_raw[i_gene] + + if issparse(cur_X_raw[0, 0]): + cur_X_raw = np.hstack((cur_X_raw[0, 0].A, cur_X_raw[1, 0].A)) + + _, cost[i_gene] = estm.auto_fit(np.unique(time), cur_X_data) + ( + model_1, + model_2, + kinetic_parameters, + mix_x0, + ) = estm.export_dictionary().values() + tmp = list(kinetic_parameters.values()) + tmp.extend(mix_x0) + Estm[i_gene] = tmp + else: + estm = Est() + cur_X_data, cur_X_raw = X[i_gene], X_raw[i_gene] + + _, cost[i_gene] = estm.auto_fit(np.unique(time), cur_X_data) + Estm[i_gene] = estm.export_parameters()[1:] + + if issparse(cur_X_raw[0, 0]): + cur_X_raw = np.hstack((cur_X_raw[0, 0].A, cur_X_raw[1, 0].A)) + # model_1, kinetic_parameters, mix_x0 = estm.export_dictionary().values() + # tmp = list(kinetic_parameters.values()) + # tmp.extend(mix_x0) + # Estm[i_gene] = tmp + + X_data[i_gene] = cur_X_data + if self.model.lower().startswith("mixture"): + X_fit_data[i_gene] = estm.simulator.x.T + X_fit_data[i_gene][estm.model1.n_species:] *= estm.scale + else: + if hasattr(estm, "extract_data_from_simulator"): + X_fit_data[i_gene] = estm.extract_data_from_simulator() + else: + X_fit_data[i_gene] = estm.simulator.x.T + + half_life[i_gene] = estm.calc_half_life("gamma") + + if self.model.lower().startswith("mixture"): + species = [0, 1, 2, 3] if self.has_splicing else [0, 1] + gof = GoodnessOfFit(estm.export_model(), params=estm.export_parameters()) + gof.prepare_data(time, cur_X_raw.T, species=species, normalize=True) + else: + gof = GoodnessOfFit( + estm.export_model(), + params=estm.export_parameters(), + x0=estm.simulator.x0, + ) + gof.prepare_data(time, cur_X_raw.T, normalize=True) + + logLL[i_gene] = gof.calc_mean_squared_deviation() # .calc_gaussian_loglikelihood() + + if self.est_method == "twostep" and self.has_splicing: + layers = ["M_u", "M_s"] if ("M_u" in subset_adata.layers.keys() and data_type == "smoothed") else ["X_u", + "X_s"] + U, S = ( + subset_adata.layers[layers[0]].T, + subset_adata.layers[layers[1]].T, + ) + US, S2 = subset_adata.layers["M_us"].T, subset_adata.layers["M_ss"].T + # beta, beta_r2 = lin_reg_gamma_synthesis(U, Ul, time, perc_right=100) + gamma_k, gamma_b, gamma_all_r2, gamma_all_logLL = fit_slope_stochastic( + S, U, US, S2, perc_left=None, perc_right=5 + ) + + Estm_df = pd.DataFrame(np.vstack(Estm), columns=[*all_keys[: len(Estm[0])]]) + Estm_df["gamma_k"] = 
gamma_k # gamma_k = gamma / beta + Estm_df["beta"] = Estm_df["gamma"] / gamma_k # gamma_k = gamma / beta + Estm_df["gamma_r2"] = gamma_all_r2 + else: + Estm_df = pd.DataFrame(np.vstack(Estm), columns=[*all_keys[: len(Estm[0])]]) + + return Estm_df, half_life, cost, logLL, _param_ranges, X_data, X_fit_data + def calculate_vel_U( self, vel: Velocity, @@ -1253,6 +2108,153 @@ def calculate_vels( class MixKineticsDynamics(LabeledDynamics): """Dynamics model for two mix experiment type: mix_kin_deg and mix_pulse_chase.""" + def estimate_params_utils(self, params_est_kwargs): + ( + subset_adata, + data_type, + return_ntr, + ) = params_est_kwargs["subset_adata"], params_est_kwargs["data_type"], params_est_kwargs["return_ntr"] + has_switch = True + param_rngs = {} + time = subset_adata.obs[self.tkey].astype("float").values + total_layer = "M_t" if ("M_t" in subset_adata.layers.keys() and data_type == "smoothed") else "X_total" + + if self.model.lower() in ["deterministic"]: + layer = "M_n" if ("M_n" in subset_adata.layers.keys() and data_type == "smoothed") else "X_new" + X, X_raw = prepare_data_no_splicing( + subset_adata, + subset_adata.var.index, + time, + layer=layer, + total_layer=total_layer, + ) + if self.model.lower() == "deterministic": + X = [X[i][0, :] for i in range(len(X))] + _param_ranges = { + "alpha": [0, 1000], + "gamma": [0, 1000], + } + x0 = {"u0": [0, 1000]} + Est = Estimation_KineticChase + else: + raise NotImplementedError( + f"only `deterministic` model implemented for mix_pulse_chase/mix_kin_deg experiment!" + ) + _param_ranges = update_dict(_param_ranges, param_rngs) + x0_ = np.vstack([ran for ran in x0.values()]).T + + n_genes = subset_adata.n_vars + cost, logLL = np.zeros(n_genes), np.zeros(n_genes) + all_keys = list(_param_ranges.keys()) + list(x0.keys()) + all_keys = [cur_key for cur_key in all_keys if cur_key != "alpha_i"] + half_life, Estm = np.zeros(n_genes), [None] * n_genes + X_data, X_fit_data = [None] * n_genes, [None] * n_genes + if self.experiment_type: + popt = [None] * n_genes + + main_debug("model: %s, experiment_type: %s" % (self.model, self.experiment_type)) + for i_gene in tqdm(range(n_genes), desc="estimating kinetic-parameters using kinetic model"): + if self.model.lower().startswith("mixture"): + estm = Est + if self.model.lower() == "mixture": + cur_X_data = np.vstack([X[i_layer][i_gene] for i_layer in range(len(X))]) + if issparse(X_raw[0]): + cur_X_raw = np.hstack([X_raw[i_layer][:, i_gene].A for i_layer in range(len(X))]) + else: + cur_X_raw = np.hstack([X_raw[i_layer][:, i_gene] for i_layer in range(len(X))]) + else: + cur_X_data = X[i_gene] + cur_X_raw = X_raw[i_gene] + + if issparse(cur_X_raw[0, 0]): + cur_X_raw = np.hstack((cur_X_raw[0, 0].A, cur_X_raw[1, 0].A)) + + _, cost[i_gene] = estm.auto_fit(np.unique(time), cur_X_data) + ( + model_1, + model_2, + kinetic_parameters, + mix_x0, + ) = estm.export_dictionary().values() + tmp = list(kinetic_parameters.values()) + tmp.extend(mix_x0) + Estm[i_gene] = tmp + else: + estm = Est() + cur_X_data, cur_X_raw = X[i_gene], X_raw[i_gene] + + popt[i_gene], cost[i_gene] = estm.auto_fit(np.unique(time), cur_X_data) + Estm[i_gene] = estm.export_parameters() + + if issparse(cur_X_raw[0, 0]): + cur_X_raw = np.hstack((cur_X_raw[0, 0].A, cur_X_raw[1, 0].A)) + # model_1, kinetic_parameters, mix_x0 = estm.export_dictionary().values() + # tmp = list(kinetic_parameters.values()) + # tmp.extend(mix_x0) + # Estm[i_gene] = tmp + + X_data[i_gene] = cur_X_data + if self.model.lower().startswith("mixture"): + 
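# For mixture models the simulator trajectories are stored as-is and the rows beyond the
# first sub-model's species are rescaled by the estimated scale factor (next two lines);
# the non-mixture branch instead stores the kinetic-chase trajectory together with the
# hidden initial concentrations returned by calc_init_conc.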
X_fit_data[i_gene] = estm.simulator.x.T + X_fit_data[i_gene][estm.model1.n_species:] *= estm.scale + else: + # kinetic chase simulation + kinetic_chase = estm.simulator.x.T + # hidden x + tt, h = estm.simulator.calc_init_conc() + + X_fit_data[i_gene] = [kinetic_chase, [tt, h]] + + half_life[i_gene] = estm.calc_half_life("gamma") + + if self.model.lower().startswith("mixture"): + species = [0, 1, 2, 3] if self.has_splicing else [0, 1] + gof = GoodnessOfFit(estm.export_model(), params=estm.export_parameters()) + gof.prepare_data(time, cur_X_raw.T, species=species, normalize=True) + else: + gof = GoodnessOfFit( + estm.export_model(), + params=estm.export_parameters(), + x0=estm.simulator.x0, + ) + gof.prepare_data(time, cur_X_raw.T, normalize=True) + + logLL[i_gene] = gof.calc_mean_squared_deviation() # .calc_gaussian_loglikelihood() + + if self.est_method == "twostep": + if self.has_splicing: + layers = ( + ["M_u", "M_s"] if ("M_u" in subset_adata.layers.keys() and data_type == "smoothed") else ["X_u", + "X_s"] + ) + U, S = ( + subset_adata.layers[layers[0]].T, + subset_adata.layers[layers[1]].T, + ) + US, S2 = ( + subset_adata.layers["M_us"].T, + subset_adata.layers["M_ss"].T, + ) + # beta, beta_r2 = lin_reg_gamma_synthesis(U, Ul, time, perc_right=100) + ( + gamma_k, + gamma_b, + gamma_all_r2, + gamma_all_logLL, + ) = fit_slope_stochastic(S, U, US, S2, perc_left=None, perc_right=5) + + Estm_df = pd.DataFrame(np.vstack(Estm), columns=[*all_keys[: len(Estm[0])]]) + Estm_df["gamma_k"] = gamma_k # gamma_k = gamma / beta + Estm_df["beta"] = Estm_df["gamma"] / gamma_k # gamma_k = gamma / beta + Estm_df["gamma_r2"] = gamma_all_r2 + else: + Estm_df = pd.DataFrame(np.vstack(Estm), columns=[*all_keys[: len(Estm[0])]]) + Estm_df["gamma_k"] = Estm_df["gamma"] # fix a bug in pl.dynamics + else: + Estm_df = pd.DataFrame(np.vstack(Estm), columns=[*all_keys[: len(Estm[0])]]) + + return Estm_df, half_life, cost, logLL, _param_ranges, X_data, X_fit_data + def calculate_vel_U( self, vel: Velocity, From 8196ce0fff1225e152560027a717c2e07f5f2639 Mon Sep 17 00:00:00 2001 From: sichao Date: Wed, 26 Jul 2023 11:19:56 -0400 Subject: [PATCH 27/31] create ss fit functions --- dynamo/estimation/csc/velocity.py | 1037 +++++++++++++++++++++++++++++ 1 file changed, 1037 insertions(+) diff --git a/dynamo/estimation/csc/velocity.py b/dynamo/estimation/csc/velocity.py index e2d292f06..498a30aaf 100755 --- a/dynamo/estimation/csc/velocity.py +++ b/dynamo/estimation/csc/velocity.py @@ -1603,6 +1603,1043 @@ def fit( _, # self.aux_param["delta_logLL"], ) = (delta, delta_intercept, delta_r2, delta_logLL) + def fit_protein(self, intercept, perc_left, perc_right, cores): + if np.all(self._exist_data("p", "su")): + ind_for_proteins = self.ind_for_proteins + n_genes = len(ind_for_proteins) if ind_for_proteins is not None else 0 + + if self.asspt_prot.lower() == "ss" and n_genes > 0: + self.parameters["eta"] = np.ones(n_genes) + (delta, delta_intercept, delta_r2, delta_logLL,) = ( + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + ) + + s = ( + self.data["su"][ind_for_proteins] + self.data["sl"][ind_for_proteins] + if self._exist_data("sl") + else self.data["su"][ind_for_proteins] + ) + if cores == 1: + for i in tqdm(range(n_genes), desc="estimating delta"): + ( + delta[i], + delta_intercept[i], + _, + delta_r2[i], + _, + delta_logLL[i], + ) = self.fit_gamma_steady_state( + s[i], + self.data["p"][i], + intercept, + perc_left, + perc_right, + ) + else: + pool = ThreadPool(cores) + res = pool.starmap( 
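# Multi-core path: each worker fits the steady-state slope for one gene, pairing the
# spliced-RNA / protein data with the shared intercept and percentile settings repeated
# via itertools.repeat; the per-gene results are unpacked and stacked once the pool is
# joined.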
+ self.fit_gamma_steady_state, + zip( + s, + self.data["p"], + itertools.repeat(intercept), + itertools.repeat(perc_left), + itertools.repeat(perc_right), + ), + ) + pool.close() + pool.join() + (delta, delta_intercept, _, delta_r2, _, delta_logLL) = zip(*res) + (delta, delta_intercept, delta_r2, delta_logLL) = ( + np.array(delta), + np.array(delta_intercept), + np.array(delta_r2), + np.array(delta_logLL), + ) + ( + self.parameters["delta"], + self.aux_param["delta_intercept"], + self.aux_param["delta_r2"], + _, # self.aux_param["delta_logLL"], + ) = (delta, delta_intercept, delta_r2, delta_logLL) + + def fit_conventional_deterministic( + self, + intercept=False, + perc_left=None, + perc_right=5, + ): + n_genes = self.get_n_genes() + cores = max(1, int(self.cores)) + if np.all(self._exist_data("uu", "su")): + self.parameters["beta"] = np.ones(n_genes) + gamma, gamma_intercept, gamma_r2, gamma_logLL = ( + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + ) + U = self.data["uu"] if self.data["ul"] is None else self.data["uu"] + self.data["ul"] + S = self.data["su"] if self.data["sl"] is None else self.data["su"] + self.data["sl"] + if cores == 1: + for i in tqdm(range(n_genes), desc="estimating gamma"): + ( + gamma[i], + gamma_intercept[i], + _, + gamma_r2[i], + _, + gamma_logLL[i], + ) = self.fit_gamma_steady_state(U[i], S[i], intercept, perc_left, perc_right) + else: + pool = ThreadPool(cores) + res = pool.starmap( + self.fit_gamma_steady_state, + zip( + U, + S, + itertools.repeat(intercept), + itertools.repeat(perc_left), + itertools.repeat(perc_right), + ), + ) + pool.close() + pool.join() + ( + gamma, + gamma_intercept, + _, + gamma_r2, + _, + gamma_logLL, + ) = zip(*res) + (gamma, gamma_intercept, gamma_r2, gamma_logLL) = ( + np.array(gamma), + np.array(gamma_intercept), + np.array(gamma_r2), + np.array(gamma_logLL), + ) + ( + self.parameters["gamma"], + self.aux_param["gamma_intercept"], + self.aux_param["gamma_r2"], + self.aux_param["gamma_logLL"], + ) = (gamma, gamma_intercept, gamma_r2, gamma_logLL) + elif np.all(self._exist_data("uu", "ul")): + self.parameters["beta"] = np.ones(n_genes) + gamma, gamma_intercept, gamma_r2, gamma_logLL = ( + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + ) + U = self.data["ul"] + S = self.data["uu"] + self.data["ul"] + if cores == 1: + for i in tqdm(range(n_genes), desc="estimating gamma"): + ( + gamma[i], + gamma_intercept[i], + _, + gamma_r2[i], + _, + gamma_logLL[i], + ) = self.fit_gamma_steady_state(U[i], S[i], intercept, perc_left, perc_right) + else: + pool = ThreadPool(cores) + res = pool.starmap( + self.fit_gamma_steady_state, + zip( + U, + S, + itertools.repeat(intercept), + itertools.repeat(perc_left), + itertools.repeat(perc_right), + ), + ) + pool.close() + pool.join() + ( + gamma, + gamma_intercept, + _, + gamma_r2, + _, + gamma_logLL, + ) = zip(*res) + (gamma, gamma_intercept, gamma_r2, gamma_logLL) = ( + np.array(gamma), + np.array(gamma_intercept), + np.array(gamma_r2), + np.array(gamma_logLL), + ) + ( + self.parameters["gamma"], + self.aux_param["gamma_intercept"], + self.aux_param["gamma_r2"], + self.aux_param["gamma_logLL"], + ) = (gamma, gamma_intercept, gamma_r2, gamma_logLL) + self.fit_protein(intercept=intercept, perc_left=perc_left, perc_right=perc_right, cores=cores) + + def fit_conventional_stochastic( + self, + intercept=False, + perc_left=None, + perc_right=5, + ): + n_genes = self.get_n_genes() + cores = max(1, int(self.cores)) + if 
np.all(self._exist_data("uu", "su")): + self.parameters["beta"] = np.ones(n_genes) + gamma, gamma_intercept, gamma_r2, gamma_logLL, bs, bf = ( + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + ) + U = self.data["uu"] if self.data["ul"] is None else self.data["uu"] + self.data["ul"] + S = self.data["su"] if self.data["sl"] is None else self.data["su"] + self.data["sl"] + US = ( + self.data["us"] + if self.data["us"] is not None + else calc_2nd_moment(U.T, S.T, self.conn, mX=U.T, mY=S.T).T + ) + S2 = ( + self.data["s2"] + if self.data["s2"] is not None + else calc_2nd_moment(S.T, S.T, self.conn, mX=S.T, mY=S.T).T + ) + if cores == 1: + for i in tqdm(range(n_genes), desc="estimating gamma"): + ( + gamma[i], + gamma_intercept[i], + _, + gamma_r2[i], + _, + gamma_logLL[i], + bs[i], + bf[i], + ) = self.fit_gamma_stochastic( + self.est_method, + U[i], + S[i], + US[i], + S2[i], + perc_left=perc_left, + perc_right=perc_right, + normalize=True, + ) + else: + pool = ThreadPool(cores) + res = pool.starmap( + self.fit_gamma_stochastic, + zip( + itertools.repeat(self.est_method), + U, + S, + US, + S2, + itertools.repeat(perc_left), + itertools.repeat(perc_right), + itertools.repeat(True), + ), + ) + pool.close() + pool.join() + ( + gamma, + gamma_intercept, + _, + gamma_r2, + _, + gamma_logLL, + bs, + bf, + ) = zip(*res) + (gamma, gamma_intercept, gamma_r2, gamma_logLL, bs, bf,) = ( + np.array(gamma), + np.array(gamma_intercept), + np.array(gamma_r2), + np.array(gamma_logLL), + np.array(bs), + np.array(bf), + ) + ( + self.parameters["gamma"], + self.aux_param["gamma_intercept"], + self.aux_param["gamma_r2"], + self.aux_param["gamma_logLL"], + self.aux_param["bs"], + self.aux_param["bf"], + ) = (gamma, gamma_intercept, gamma_r2, gamma_logLL, bs, bf) + elif np.all(self._exist_data("uu", "ul")): + self.parameters["beta"] = np.ones(n_genes) + gamma, gamma_intercept, gamma_r2, gamma_logLL, bs, bf = ( + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + ) + U = self.data["ul"] + S = self.data["uu"] + self.data["ul"] + US = ( + self.data["us"] + if self.data["us"] is not None + else calc_2nd_moment(U.T, S.T, self.conn, mX=U.T, mY=S.T).T + ) + S2 = ( + self.data["s2"] + if self.data["s2"] is not None + else calc_2nd_moment(S.T, S.T, self.conn, mX=S.T, mY=S.T).T + ) + if cores == 1: + for i in tqdm(range(n_genes), desc="estimating gamma"): + ( + gamma[i], + gamma_intercept[i], + _, + gamma_r2[i], + _, + gamma_logLL[i], + bs[i], + bf[i], + ) = self.fit_gamma_stochastic( + self.est_method, + U[i], + S[i], + US[i], + S2[i], + perc_left=perc_left, + perc_right=perc_right, + normalize=True, + ) + else: + pool = ThreadPool(cores) + res = pool.starmap( + self.fit_gamma_stochastic, + zip( + itertools.repeat(self.est_method), + U, + S, + US, + S2, + itertools.repeat(perc_left), + itertools.repeat(perc_right), + itertools.repeat(True), + ), + ) + pool.close() + pool.join() + ( + gamma, + gamma_intercept, + _, + gamma_r2, + _, + gamma_logLL, + bs, + bf, + ) = zip(*res) + (gamma, gamma_intercept, gamma_r2, gamma_logLL, bs, bf,) = ( + np.array(gamma), + np.array(gamma_intercept), + np.array(gamma_r2), + np.array(gamma_logLL), + np.array(bs), + np.array(bf), + ) + + ( + self.parameters["gamma"], + self.aux_param["gamma_intercept"], + self.aux_param["gamma_r2"], + self.aux_param["gamma_logLL"], + self.aux_param["bs"], + self.aux_param["bf"], + ) = (gamma, gamma_intercept, 
gamma_r2, gamma_logLL, bs, bf) + + self.fit_protein(intercept=intercept, perc_left=perc_left, perc_right=perc_right, cores=cores) + + def fit_oneshot( + self, + intercept=False, + perc_left=None, + perc_right=5, + clusters=None, + one_shot_method="combined", + ): + n_genes = self.get_n_genes() + cores = max(1, int(self.cores)) + if len(np.unique(self.t)) > 1: + if np.all(self._exist_data("ul", "uu", "su")): + if not self._exist_parameter("beta"): + warn("beta & gamma estimation: only works when there're at least 2 time points.") + uu_m, uu_v, t_uniq = calc_12_mom_labeling(self.data["uu"], self.t) + su_m, su_v, _ = calc_12_mom_labeling(self.data["su"], self.t) + + ( + self.parameters["beta"], + self.parameters["gamma"], + self.aux_param["uu0"], + self.aux_param["su0"], + ) = self.fit_beta_gamma_lsq(t_uniq, uu_m, su_m) + # alpha estimation + ul_m, ul_v, t_uniq = calc_12_mom_labeling(self.data["ul"], self.t) + alpha = np.zeros(n_genes) + # let us only assume one alpha for each gene in all cells + if cores == 1: + for i in tqdm(range(n_genes), desc="estimating alpha"): + # for j in range(len(self.data['ul'][i])): + alpha[i] = fit_alpha_synthesis(t_uniq, ul_m[i], self.parameters["beta"][i]) + else: + pool = ThreadPool(cores) + alpha = pool.starmap( + fit_alpha_synthesis, + zip( + itertools.repeat(t_uniq), + ul_m, + self.parameters["beta"], + ), + ) + pool.close() + pool.join() + alpha = np.array(alpha) + self.parameters["alpha"] = alpha + elif np.all(self._exist_data("ul", "uu")): + n_genes = self.data["uu"].shape[0] # self.get_n_genes(data=U) + u0, gamma = np.zeros(n_genes), np.zeros(n_genes) + uu_m, uu_v, t_uniq = calc_12_mom_labeling(self.data["uu"], self.t) + for i in tqdm(range(n_genes), desc="estimating gamma"): + try: + gamma[i], u0[i] = fit_first_order_deg_lsq(t_uniq, uu_m[i]) + except: + gamma[i], u0[i] = 0, 0 + self.parameters["gamma"], self.aux_param["uu0"] = gamma, u0 + alpha = np.zeros(n_genes) + # let us only assume one alpha for each gene in all cells + ul_m, ul_v, _ = calc_12_mom_labeling(self.data["ul"], self.t) + if cores == 1: + for i in tqdm(range(n_genes), desc="estimating gamma"): + # for j in range(len(self.data['ul'][i])): + alpha[i] = fit_alpha_synthesis(t_uniq, ul_m[i], self.parameters["gamma"][i]) + else: + pool = ThreadPool(cores) + alpha = pool.starmap( + fit_alpha_synthesis, + zip( + itertools.repeat(t_uniq), + ul_m, + self.parameters["gamma"], + ), + ) + pool.close() + pool.join() + alpha = np.array(alpha) + self.parameters["alpha"] = alpha + # alpha: one-shot + # 'one_shot' + else: + t_uniq = np.unique(self.t) + if len(t_uniq) > 1: + raise Exception( + "By definition, one-shot experiment should involve only one time point measurement!" + ) + # calculate when having splicing or no splicing + if self.model.lower() == "deterministic": + if np.all(self._exist_data("ul", "uu", "su")): + if self._exist_parameter("beta", "gamma").all(): + self.parameters["alpha"] = self.fit_alpha_oneshot( + self.t, + self.data["ul"], + self.parameters["beta"], + clusters, + ) + else: + beta, gamma, U0, S0 = ( + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + ) + for i in range( + n_genes + ): # can also use the two extreme time points and apply sci-fate like approach. 
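# One-shot case without pre-estimated beta/gamma: for each gene, gamma is solved from the
# unlabeled spliced counts relative to total spliced counts at the single labeling time
# (solve_gamma), beta analogously from the unspliced counts, and the means of the totals
# are stored as the initial values S0 / U0; alpha is then fit from the labeled mean via
# fit_alpha_synthesis further below.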
+ S, U = ( + self.data["su"][i] + self.data["sl"][i], + self.data["uu"][i] + self.data["ul"][i], + ) + + S0[i], gamma[i] = ( + np.mean(S), + solve_gamma(np.max(self.t), self.data["su"][i], S), + ) + U0[i], beta[i] = ( + np.mean(U), + solve_gamma(np.max(self.t), self.data["uu"][i], U), + ) + ( + self.aux_param["U0"], + self.aux_param["S0"], + self.parameters["beta"], + self.parameters["gamma"], + ) = (U0, S0, beta, gamma) + + ul_m, ul_v, t_uniq = calc_12_mom_labeling(self.data["ul"], self.t) + alpha = np.zeros(n_genes) + # let us only assume one alpha for each gene in all cells + if cores == 1: + for i in tqdm(range(n_genes), desc="estimating alpha"): + # for j in range(len(self.data['ul'][i])): + alpha[i] = fit_alpha_synthesis( + t_uniq, + ul_m[i], + self.parameters["beta"][i], + ) + else: + pool = ThreadPool(cores) + alpha = pool.starmap( + fit_alpha_synthesis, + zip( + itertools.repeat(t_uniq), + ul_m, + self.parameters["beta"], + ), + ) + pool.close() + pool.join() + alpha = np.array(alpha) + self.parameters["alpha"] = alpha + # self.parameters['alpha'] = self.fit_alpha_oneshot(self.t, self.data['ul'], self.parameters['beta'], clusters) + else: + if self._exist_data("ul") and self._exist_parameter("gamma"): + self.parameters["alpha"] = self.fit_alpha_oneshot( + self.t, + self.data["ul"], + self.parameters["gamma"], + clusters, + ) + elif self._exist_data("ul") and self._exist_data("uu"): + if one_shot_method in ["sci-fate", "sci_fate"]: + gamma, total0 = np.zeros(n_genes), np.zeros(n_genes) + for i in tqdm(range(n_genes), desc="estimating gamma"): + total = self.data["uu"][i] + self.data["ul"][i] + total0[i], gamma[i] = ( + np.mean(total), + solve_gamma( + np.max(self.t), + self.data["uu"][i], + total, + ), + ) + (self.aux_param["total0"], self.parameters["gamma"],) = ( + total0, + gamma, + ) + + ul_m, ul_v, t_uniq = calc_12_mom_labeling(self.data["ul"], self.t) + # let us only assume one alpha for each gene in all cells + alpha = np.zeros(n_genes) + if cores == 1: + for i in tqdm(range(n_genes), desc="estimating alpha"): + # for j in range(len(self.data['ul'][i])): + alpha[i] = fit_alpha_synthesis( + t_uniq, + ul_m[i], + self.parameters["gamma"][i], + ) # ul_m[i] / t_uniq + else: + pool = ThreadPool(cores) + alpha = pool.starmap( + fit_alpha_synthesis, + zip( + itertools.repeat(t_uniq), + ul_m, + self.parameters["gamma"], + ), + ) + pool.close() + pool.join() + alpha = np.array(alpha) + self.parameters["alpha"] = alpha + # self.parameters['alpha'] = self.fit_alpha_oneshot(self.t, self.data['ul'], self.parameters['gamma'], clusters) + elif one_shot_method == "combined": + self.parameters["alpha"] = ( + csr_matrix(self.data["ul"].shape) + if issparse(self.data["ul"]) + else np.zeros_like(self.data["ul"].shape) + ) + (t_uniq, gamma, gamma_k, gamma_intercept, gamma_r2, gamma_logLL,) = ( + np.unique(self.t), + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + ) + U, S = ( + self.data["ul"], + self.data["uu"] + self.data["ul"], + ) + + if cores == 1: + for i in tqdm(range(n_genes), desc="estimating gamma"): + ( + gamma_k[i], + gamma_intercept[i], + _, + gamma_r2[i], + _, + gamma_logLL[i], + ) = self.fit_gamma_steady_state(U[i], S[i], False, None, perc_right) + ( + gamma[i], + self.parameters["alpha"][i], + ) = one_shot_gamma_alpha(gamma_k[i], t_uniq, U[i]) + else: + pool = ThreadPool(cores) + res1 = pool.starmap( + self.fit_gamma_steady_state, + zip( + U, + S, + itertools.repeat(False), + itertools.repeat(None), + 
itertools.repeat(perc_right), + ), + ) + + ( + gamma_k, + gamma_intercept, + _, + gamma_r2, + _, + gamma_logLL, + ) = zip(*res1) + (gamma_k, gamma_intercept, gamma_r2, gamma_logLL,) = ( + np.array(gamma_k), + np.array(gamma_intercept), + np.array(gamma_r2), + np.array(gamma_logLL), + ) + + res2 = pool.starmap( + one_shot_gamma_alpha, + zip(gamma_k, itertools.repeat(t_uniq), U), + ) + + (gamma, alpha) = zip(*res2) + (gamma, self.parameters["alpha"]) = ( + np.array(gamma), + np.array(alpha), + ) + + pool.close() + pool.join() + ( + self.parameters["gamma"], + self.aux_param["gamma_k"], + self.aux_param["gamma_intercept"], + self.aux_param["gamma_r2"], + self.aux_param["gamma_logLL"], + self.aux_param["alpha_r2"], + ) = ( + gamma, + gamma_k, + gamma_intercept, + gamma_r2, + gamma_logLL, + gamma_r2, + ) + elif self.model.lower() == "stochastic": + if np.all(self._exist_data("uu", "ul", "su", "sl")): + self.parameters["beta"] = np.ones(n_genes) + k, k_intercept, k_r2, k_logLL, bs, bf = ( + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + ) + U = self.data["uu"] + S = self.data["uu"] + self.data["ul"] + US = ( + self.data["us"] + if self.data["us"] is not None + else calc_2nd_moment(U.T, S.T, self.conn, mX=U.T, mY=S.T).T + ) + S2 = ( + self.data["s2"] + if self.data["s2"] is not None + else calc_2nd_moment(S.T, S.T, self.conn, mX=S.T, mY=S.T).T + ) + if cores == 1: + for i in tqdm( + range(n_genes), + desc="estimating beta and alpha for one-shot experiment", + ): + ( + k[i], + k_intercept[i], + _, + k_r2[i], + _, + k_logLL[i], + bs[i], + bf[i], + ) = self.fit_gamma_stochastic( + self.est_method, + U[i], + S[i], + US[i], + S2[i], + perc_left=perc_left, + perc_right=perc_right, + normalize=True, + ) + else: + pool = ThreadPool(cores) + res = pool.starmap( + self.fit_gamma_stochastic, + zip( + itertools.repeat(self.est_method), + U, + S, + US, + S2, + itertools.repeat(perc_left), + itertools.repeat(perc_right), + itertools.repeat(True), + ), + ) + pool.close() + pool.join() + ( + k, + k_intercept, + _, + k_r2, + _, + k_logLL, + bs, + bf, + ) = zip(*res) + (k, k_intercept, k_r2, k_logLL, bs, bf) = ( + np.array(k), + np.array(k_intercept), + np.array(k_r2), + np.array(k_logLL), + np.array(bs), + np.array(bf), + ) + beta, alpha0 = one_shot_gamma_alpha_matrix(k, t_uniq, U) + + self.parameters["beta"], self.aux_param["beta_k"] = ( + beta, + k, + ) + + U = self.data["uu"] + self.data["ul"] + S = U + self.data["su"] + self.data["sl"] + US = ( + self.data["us"] + if self.data["us"] is not None + else calc_2nd_moment(U.T, S.T, self.conn, mX=U.T, mY=S.T).T + ) + S2 = ( + self.data["s2"] + if self.data["s2"] is not None + else calc_2nd_moment(S.T, S.T, self.conn, mX=S.T, mY=S.T).T + ) + if cores == 1: + for i in tqdm( + range(n_genes), + desc="estimating gamma and alpha for one-shot experiment", + ): + ( + k[i], + k_intercept[i], + _, + k_r2[i], + _, + k_logLL[i], + bs[i], + bf[i], + ) = self.fit_gamma_stochastic( + self.est_method, + U[i], + S[i], + US[i], + S2[i], + perc_left=perc_left, + perc_right=perc_right, + normalize=True, + ) + else: + pool = ThreadPool(cores) + res = pool.starmap( + self.fit_gamma_stochastic, + zip( + itertools.repeat(self.est_method), + U, + S, + US, + S2, + itertools.repeat(perc_left), + itertools.repeat(perc_right), + itertools.repeat(True), + ), + ) + pool.close() + pool.join() + (k, k_intercept, _, k_r2, _, k_logLL, bs, bf) = zip(*res) + (k, k_intercept, k_r2, k_logLL, bs, bf) = ( + np.array(k), + 
np.array(k_intercept), + np.array(k_r2), + np.array(k_logLL), + np.array(bs), + np.array(bf), + ) + + gamma, alpha = one_shot_gamma_alpha_matrix(k, t_uniq, U) + ( + self.parameters["alpha"], + self.parameters["gamma"], + self.aux_param["gamma_k"], + self.aux_param["gamma_intercept"], + self.aux_param["gamma_r2"], + self.aux_param["gamma_logLL"], + self.aux_param["bs"], + self.aux_param["bf"], + ) = ( + (alpha + alpha0) / 2, + gamma, + k, + k_intercept, + k_r2, + k_logLL, + bs, + bf, + ) + elif np.all(self._exist_data("uu", "ul")): + if one_shot_method == "storm-csp": + gamma, gamma_r2, k = ( + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + ) + new_counts = self.data["new_counts"] + total_counts = self.data["total_counts"] + new_smooth_csp = self.data["new_smooth_csp"] + new_smooth = self.data['ul'] + total_smooth = self.data["ul"] + self.data["uu"] + for i in tqdm(range(n_genes), desc="estimating gamma via storm's csp model"): + ( + gamma[i], + gamma_r2[i], + k[i], + ) = self.fit_gamma_storm_csp( + new_counts[i], + total_counts[i], + new_smooth[i], + total_smooth[i], + t_uniq=t_uniq, + perc_left=perc_left, + perc_right=perc_right, + normalize=True, + ) + _, alpha = one_shot_gamma_alpha_matrix(k, t_uniq, new_smooth_csp) + ( + self.parameters["alpha"], + self.parameters["gamma"], + self.aux_param["gamma_k"], + self.aux_param["gamma_intercept"], + self.aux_param["gamma_r2"], + ) = ( + alpha, + gamma, + k, + np.zeros(n_genes), + gamma_r2, + ) + else: + k, k_intercept, k_r2, k_logLL, bs, bf = ( + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + ) + U = self.data["ul"] + S = self.data["ul"] + self.data["uu"] + US = ( + self.data["us"] + if self.data["us"] is not None + else calc_2nd_moment(U.T, S.T, self.conn, mX=U.T, mY=S.T).T + ) + S2 = ( + self.data["s2"] + if self.data["s2"] is not None + else calc_2nd_moment(S.T, S.T, self.conn, mX=S.T, mY=S.T).T + ) + if cores == 1: + for i in tqdm(range(n_genes), desc="estimating gamma"): + ( + k[i], + k_intercept[i], + _, + k_r2[i], + _, + k_logLL[i], + bs[i], + bf[i], + ) = self.fit_gamma_stochastic( + self.est_method, + U[i], + S[i], + US[i], + S2[i], + perc_left=perc_left, + perc_right=perc_right, + normalize=True, + ) + else: + pool = ThreadPool(cores) + res = pool.starmap( + self.fit_gamma_stochastic, + zip( + itertools.repeat(self.est_method), + U, + S, + US, + S2, + itertools.repeat(perc_left), + itertools.repeat(perc_right), + itertools.repeat(True), + ), + ) + pool.close() + pool.join() + (k, k_intercept, _, k_r2, _, k_logLL, bs, bf) = zip(*res) + (k, k_intercept, k_r2, k_logLL, bs, bf) = ( + np.array(k), + np.array(k_intercept), + np.array(k_r2), + np.array(k_logLL), + np.array(bs), + np.array(bf), + ) + + gamma, alpha = one_shot_gamma_alpha_matrix(k, t_uniq, U) + ( + self.parameters["alpha"], + self.parameters["gamma"], + self.aux_param["gamma_k"], + self.aux_param["gamma_intercept"], + self.aux_param["gamma_r2"], + self.aux_param["gamma_logLL"], + self.aux_param["bs"], + self.aux_param["bf"], + ) = ( + alpha, + gamma, + k, + k_intercept, + k_r2, + k_logLL, + bs, + bf, + ) + + self.fit_protein(intercept=intercept, perc_left=perc_left, perc_right=perc_right, cores=cores) + + def fit_mix_std_stm( + self, + intercept=False, + perc_left=None, + perc_right=5, + ): + n_genes = self.get_n_genes() + cores = max(1, int(self.cores)) + t_min, t_max = np.min(self.t), np.max(self.t) + if np.all(self._exist_data("ul", "uu", "su")): + gamma, beta, total, U = ( + 
np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + np.zeros(n_genes), + ) + for i in tqdm( + range(n_genes), desc="solving gamma/beta" + ): # can also use the two extreme time points and apply sci-fate like approach. + tmp = ( + self.data["uu"][i, self.t == t_max] + + self.data["ul"][i, self.t == t_max] + + self.data["su"][i, self.t == t_max] + + self.data["sl"][i, self.t == t_max] + ) + total[i] = np.mean(tmp) + gamma[i] = solve_gamma( + t_max, + self.data["uu"][i, self.t == t_max] + self.data["su"][i, self.t == t_max], + tmp, + ) + # same for beta + tmp = self.data["uu"][i, self.t == t_max] + self.data["ul"][i, self.t == t_max] + U[i] = np.mean(tmp) + beta[i] = solve_gamma( + np.max(self.t), + self.data["uu"][i, self.t == t_max], + tmp, + ) + + ( + self.parameters["beta"], + self.parameters["gamma"], + self.aux_param["total0"], + self.aux_param["U0"], + ) = (beta, gamma, total, U) + # alpha estimation + self.parameters["alpha"] = self.solve_alpha_mix_std_stm( + self.t, self.data["ul"], self.parameters["beta"] + ) + elif np.all(self._exist_data("ul", "uu")): + n_genes = self.data["uu"].shape[0] # self.get_n_genes(data=U) + gamma, U = np.zeros(n_genes), np.zeros(n_genes) + for i in tqdm( + range(n_genes), desc="solving gamma, alpha" + ): # apply sci-fate like approach (can also use one-single time point to estimate gamma) + # tmp = self.data['uu'][i, self.t == 0] + self.data['ul'][i, self.t == 0] + tmp_ = self.data["uu"][i, self.t == t_max] + self.data["ul"][i, self.t == t_max] + + U[i] = np.mean(tmp_) + # gamma_1 = solve_gamma(np.max(self.t), self.data['uu'][i, self.t == 0], tmp) # steady state + gamma_2 = solve_gamma(t_max, self.data["uu"][i, self.t == t_max], tmp_) # stimulation + # gamma_3 = solve_gamma(np.max(self.t), self.data['uu'][i, self.t == np.max(self.t)], tmp) # sci-fate + gamma[i] = gamma_2 + # print('Steady state, stimulation, sci-fate like gamma values are ', gamma_1, '; ', gamma_2, '; ', gamma_3) + (self.parameters["gamma"], self.aux_param["U0"], self.parameters["beta"],) = ( + gamma, + U, + np.ones(gamma.shape), + ) + # alpha estimation + self.parameters["alpha"] = self.solve_alpha_mix_std_stm( + self.t, self.data["ul"], self.parameters["gamma"] + + self.fit_protein(intercept=intercept, perc_left=perc_left, perc_right=perc_right, cores=cores) + def fit_gamma_steady_state(self, u, s, intercept=True, perc_left=None, perc_right=5, normalize=True): """Estimate gamma using linear regression based on the steady state assumption. 
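The "combined" one-shot branch above first fits a steady-state slope k between new and total RNA (via fit_gamma_steady_state) and then converts that slope into an absolute degradation rate and a cell-wise transcription rate (via one_shot_gamma_alpha). Below is a minimal NumPy sketch of that conversion only; the function name and signature are illustrative and are not the helpers used in this patch, but the relations k = 1 - exp(-gamma * t) and alpha = gamma * new / k match the formulas used elsewhere in this file. It assumes gene-wise slope and labeled-count vectors and a single scalar labeling time.

import numpy as np

def one_shot_gamma_alpha_sketch(k, t, new):
    # Illustrative sketch, not the dynamo implementation:
    # under the one-shot model new = (alpha / gamma) * (1 - exp(-gamma * t)),
    # the steady-state slope k of new vs. total equals 1 - exp(-gamma * t).
    k = np.clip(k, 1e-6, 1 - 1e-6)      # keep the log well defined
    gamma = -np.log(1.0 - k) / t        # absolute degradation rate
    alpha = new * gamma / k             # cell-wise transcription rate
    return gamma, alpha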
From 3a5621792c3c37dedccecb1aff412304125d2791 Mon Sep 17 00:00:00 2001 From: sichao Date: Wed, 26 Jul 2023 14:17:23 -0400 Subject: [PATCH 28/31] create kinetic params estimation class --- dynamo/estimation/csc/velocity.py | 1 + dynamo/tools/dynamics.py | 4116 +++++++++++++++-------------- 2 files changed, 2061 insertions(+), 2056 deletions(-) diff --git a/dynamo/estimation/csc/velocity.py b/dynamo/estimation/csc/velocity.py index 498a30aaf..85229499c 100755 --- a/dynamo/estimation/csc/velocity.py +++ b/dynamo/estimation/csc/velocity.py @@ -2637,6 +2637,7 @@ def fit_mix_std_stm( # alpha estimation self.parameters["alpha"] = self.solve_alpha_mix_std_stm( self.t, self.data["ul"], self.parameters["gamma"] + ) self.fit_protein(intercept=intercept, perc_left=perc_left, perc_right=perc_right, cores=cores) diff --git a/dynamo/tools/dynamics.py b/dynamo/tools/dynamics.py index 5dbf54737..4cc08fd09 100755 --- a/dynamo/tools/dynamics.py +++ b/dynamo/tools/dynamics.py @@ -1064,119 +1064,8 @@ def calculate_vels( class TwoStepKineticsDynamics(KineticsDynamics): def estimate_params_utils(self, params_est_kwargs): - ( - subset_adata, - data_type, - return_ntr, - ) = params_est_kwargs["subset_adata"], params_est_kwargs["data_type"], params_est_kwargs["return_ntr"] - time = subset_adata.obs[self.tkey].astype("float").values - if self.has_splicing: - layers = ( - ["M_u", "M_s", "M_t", "M_n"] - if ("M_u" in subset_adata.layers.keys() and data_type == "smoothed") - else ["X_u", "X_s", "X_t", "X_n"] - ) - U, S, Total, New = ( - subset_adata.layers[layers[0]].T, - subset_adata.layers[layers[1]].T, - subset_adata.layers[layers[2]].T, - subset_adata.layers[layers[3]].T, - ) - US, S2 = ( - subset_adata.layers["M_us"].T, - subset_adata.layers["M_ss"].T, - ) - # gamma, gamma_r2 = lin_reg_gamma_synthesis(U, Ul, time, perc_right=100) - ( - gamma_k, - gamma_b, - gamma_all_r2, - gamma_all_logLL, - ) = fit_slope_stochastic(S, U, US, S2, perc_left=None, perc_right=100) - ( - gamma, - gamma_r2, - X_data, - mean_R2, - K_fit, - ) = lin_reg_gamma_synthesis(Total, New, time, perc_right=100) - - k = 1 - np.exp(-gamma[:, None] * time[None, :]) - beta = gamma / gamma_k # gamma_k = gamma / beta - - Estm_df = { - "alpha": csr_matrix(gamma[:, None]).multiply(New).multiply(1 / k), - "beta": beta, - "gamma_k": gamma_k, - "gamma_b": gamma_b, - "gamma_k_r2": gamma_all_r2, - "gamma_logLL": gamma_all_logLL, - "gamma": gamma, - "gamma_r2": gamma_r2, - "mean_R2": mean_R2, - } - half_life = np.log(2) / gamma - cost, logLL, _param_ranges, X_data, X_fit_data = ( - None, - None, - None, - X_data, - K_fit, - ) - - return ( - Estm_df, - half_life, - cost, - logLL, - _param_ranges, - X_data, - X_fit_data, - ) - else: - layers = ( - ["M_t", "M_n"] - if ("M_t" in subset_adata.layers.keys() and data_type == "smoothed") - else ["X_t", "X_n"] - ) - Total, New = ( - subset_adata.layers[layers[0]].T, - subset_adata.layers[layers[1]].T, - ) - ( - gamma, - gamma_r2, - X_data, - mean_R2, - K_fit, - ) = lin_reg_gamma_synthesis(Total, New, time, perc_right=100) - - k = 1 - np.exp(-gamma[:, None] * time[None, :]) - Estm_df = { - "alpha": csr_matrix(gamma[:, None]).multiply(New).multiply(1 / k), - "gamma": gamma, - "gamma_k": gamma, # required for phase_potrait - "gamma_r2": gamma_r2, - "mean_R2": mean_R2, - } - half_life = np.log(2) / gamma - cost, logLL, _param_ranges, X_data, X_fit_data = ( - None, - None, - None, - X_data, - K_fit, - ) + pass - return ( - Estm_df, - half_life, - cost, - logLL, - _param_ranges, - X_data, - X_fit_data, - ) class 
KineticsStormDynamics(LabeledDynamics): """Stochastic transient dynamics for the kinetic experiment with kinetic assumption. This includes three stochastic @@ -1184,157 +1073,7 @@ class KineticsStormDynamics(LabeledDynamics): transcription, splicing, and spliced mRNA degradation. And in Model 3, we considered the switching of gene expression states, transcription in the active state, and mRNA degradation.""" def estimate_params_utils(self, params_est_kwargs): - ( - subset_adata, - data_type, - return_ntr, - ) = params_est_kwargs["subset_adata"], params_est_kwargs["data_type"], params_est_kwargs["return_ntr"] - time = subset_adata.obs[self.tkey].astype("float").values - if self.has_splicing: - # Initialization based on the steady-state assumption - layers_smoothed = ["M_u", "M_s", "M_t", "M_n"] - U_smoothed, S_smoothed, Total_smoothed, New_smoothed = ( - subset_adata.layers[layers_smoothed[0]].T, - subset_adata.layers[layers_smoothed[1]].T, - subset_adata.layers[layers_smoothed[2]].T, - subset_adata.layers[layers_smoothed[3]].T, - ) - - US_smoothed, S2_smoothed = ( - subset_adata.layers["M_us"].T, - subset_adata.layers["M_ss"].T, - ) - (gamma_k, _, _, _,) = fit_slope_stochastic(S_smoothed, U_smoothed, US_smoothed, S2_smoothed, - perc_left=None, perc_right=5) - (gamma_init, _, _, _, _) = lin_reg_gamma_synthesis(Total_smoothed, New_smoothed, time, perc_right=5) - beta_init = gamma_init / gamma_k # gamma_k = gamma / beta - - # Read raw counts - layers_raw = ["ul", "sl"] - UL_raw, SL_raw = ( - subset_adata.layers[layers_raw[0]].T, - subset_adata.layers[layers_raw[1]].T, - ) - - # Read smoothed values based CSP type distribution for cell-specific parameter inference - UL_smoothed_CSP, SL_smoothed_CSP = ( - subset_adata.layers['M_CSP_ul'].T, - subset_adata.layers['M_CSP_sl'].T, - ) - - # Parameters inference based on maximum likelihood estimation - cell_total = subset_adata.obs['initial_cell_size'].astype("float").values - # Independent cell-specific Poisson - (gamma_s, gamma_r2, beta, gamma_t, gamma_r2_raw, alpha) = storm.mle_independent_cell_specific_poisson \ - (UL_raw, SL_raw, time, gamma_init, beta_init, cell_total, Total_smoothed, S_smoothed) - gamma_k = gamma_s / beta - gamma_b = np.zeros_like(gamma_k) - - # Cell specific parameters (fixed gamma_s) - alpha, beta = storm.cell_specific_alpha_beta(UL_smoothed_CSP, SL_smoothed_CSP, time, gamma_s, beta) - - # # Cell specific parameters(fixed gamma_t) - # k = 1 - np.exp(-gamma_t[:, None] * time[None, :]) - # alpha = csr_matrix(gamma_t[:, None]).multiply(UL_smoothed_CSP+SL_smoothed_CSP).multiply(1 / k) - - Estm_df = { - "alpha": alpha, - "beta": beta, - "gamma_k": gamma_k, - "gamma_b": gamma_b, - # "gamma_k_r2": gamma_all_r2, - # "gamma_logLL": gamma_all_logLL, - "gamma": gamma_s, - "gamma_r2": gamma_r2, - # "mean_R2": mean_R2, - "gamma_t": gamma_t, - "gamma_r2_raw": gamma_r2_raw, - } - half_life = np.log(2) / gamma_s - cost, logLL, _param_ranges, X_data, X_fit_data = ( - None, - None, - None, - None, - None, - ) - - return ( - Estm_df, - half_life, - cost, - logLL, - _param_ranges, - X_data, - X_fit_data, - ) - else: - # Initialization based on the steady-state assumption - layers_smoothed = ["M_t", "M_n"] - Total_smoothed, New_smoothed = ( - subset_adata.layers[layers_smoothed[0]].T, - subset_adata.layers[layers_smoothed[1]].T, - ) - (gamma_init, _, _, _, _,) = lin_reg_gamma_synthesis(Total_smoothed, New_smoothed, time, - perc_right=5) - - # Read raw counts - layers_raw = ["total", "new"] - Total_raw, New_raw = ( - 
subset_adata.layers[layers_raw[0]].T, - subset_adata.layers[layers_raw[1]].T, - ) - - # Read smoothed values based CSP type distribution for cell-specific parameter inference - layers_smoothed_CSP = ["M_CSP_t", "M_CSP_n"] - Total_smoothed_CSP, New_smoothed_CSP = ( - subset_adata.layers[layers_smoothed_CSP[0]].T, - subset_adata.layers[layers_smoothed_CSP[1]].T, - ) - - # Parameters inference based on maximum likelihood estimation - cell_total = subset_adata.obs['initial_cell_size'].astype("float").values - - if "storm-csp" == self.est_method: - gamma, gamma_r2, gamma_r2_raw, alpha = storm.mle_cell_specific_poisson(New_raw, time, - gamma_init, cell_total) - elif "storm-cszip" == self.est_method: - gamma, prob_off, gamma_r2, gamma_r2_raw, alpha = storm.mle_cell_specific_zero_inflated_poisson( - New_raw, time, gamma_init, cell_total) - alpha = alpha * (1 - prob_off) # gene-wise alpha - else: - raise NotImplementedError("This method has not been implemented.") - - k = 1 - np.exp(-gamma[:, None] * time[None, :]) - alpha = csr_matrix(gamma[:, None]).multiply(New_smoothed_CSP).multiply(1 / k) # gene-cell-wise alpha - - Estm_df = { - "alpha": alpha, - "gamma": gamma, - "gamma_k": gamma, # required for phase_potrait - "gamma_r2": gamma_r2, - "gamma_r2_raw": gamma_r2_raw, - # "mean_R2": mean_R2, - "prob_off": prob_off if "cszip" in self.est_method else None - } - half_life = np.log(2) / gamma - cost, logLL, _param_ranges, X_data, X_fit_data = ( - None, - None, - None, - None, # X_data, - None, # K_fit, - ) - - return ( - Estm_df, - half_life, - cost, - logLL, - _param_ranges, - X_data, - X_fit_data, - ) + pass def calculate_vel_U( self, @@ -1404,1971 +1143,2236 @@ def calculate_vels( class DirectKineticsDynamics(KineticsDynamics): def estimate_params_utils(self, params_est_kwargs): - ( - subset_adata, - data_type, - return_ntr, - ) = params_est_kwargs["subset_adata"], params_est_kwargs["data_type"], params_est_kwargs["return_ntr"] - has_switch = True - param_rngs = {} - time = subset_adata.obs[self.tkey].astype("float").values - if self.has_splicing and self.splicing_labeling: - layers = ( - ["M_ul", "M_sl", "M_uu", "M_su"] - if ("M_ul" in subset_adata.layers.keys() and data_type == "smoothed") - else ["X_ul", "X_sl", "X_uu", "X_su"] - ) - - if self.model.lower() in ["deterministic", "stochastic"]: - layer_u = "M_ul" if ("M_ul" in subset_adata.layers.keys() and data_type == "smoothed") else "X_ul" - layer_s = "M_sl" if ("M_ul" in subset_adata.layers.keys() and data_type == "smoothed") else "X_sl" + pass - X, X_raw = prepare_data_has_splicing( - subset_adata, - subset_adata.var.index, - time, - layer_u=layer_u, - layer_s=layer_s, - total_layers=layers, - ) - elif self.model.startswith("mixture"): - X, _, X_raw = prepare_data_deterministic( - subset_adata, - subset_adata.var.index, - time, - layers=layers, - total_layers=layers, - ) - if self.model.lower() == "deterministic": - X = [X[i][[0, 1], :] for i in range(len(X))] - _param_ranges = { - "alpha": [0, 1000], - "beta": [0, 1000], - "gamma": [0, 1000], - } - x0 = {"u0": [0, 1000], "s0": [0, 1000]} - Est, _ = Estimation_DeterministicKin, Deterministic - elif self.model.lower() == "stochastic": - x0 = { - "u0": [0, 1000], - "s0": [0, 1000], - "uu0": [0, 1000], - "ss0": [0, 1000], - "us0": [0, 1000], - } - - if has_switch: - _param_ranges = { - "a": [0, 1000], - "b": [0, 1000], - "alpha_a": [0, 1000], - "alpha_i": 0, - "beta": [0, 1000], - "gamma": [0, 1000], - } - Est, _ = Estimation_MomentKin, Moments - else: - _param_ranges = { - "alpha": [0, 1000], 
- "beta": [0, 1000], - "gamma": [0, 1000], - } +class DegradationDynamics(LabeledDynamics): + """Dynamics model for the degradation experiment. In degradation experiment, samples are chased after an extended + 4sU (or other nucleotide analog) labeling period and the wash-out to observe the decay of the abundance of the + (labeled) unspliced and spliced RNA decay over time.""" + def calculate_vel_U( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: + return np.nan - Est, _ = ( - Estimation_MomentKinNoSwitch, - Moments_NoSwitching, - ) - elif self.model.lower() == "mixture": - _param_ranges = { - "alpha": [0, 1000], - "alpha_2": [0, 0], - "beta": [0, 1000], - "gamma": [0, 1000], - } - x0 = { - "ul0": [0, 0], - "sl0": [0, 0], - "uu0": [0, 1000], - "su0": [0, 1000], - } + def calculate_vel_S( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: + return vel.vel_s(U, S) - Est = Mixture_KinDeg_NoSwitching(Deterministic(), Deterministic()) - elif self.model.lower() == "mixture_deterministic_stochastic": - X, X_raw = prepare_data_mix_has_splicing( - subset_adata, - subset_adata.var.index, - time, - layer_u=layers[2], - layer_s=layers[3], - layer_ul=layers[0], - layer_sl=layers[1], - total_layers=layers, - mix_model_indices=[0, 1, 5, 6, 7, 8, 9], - ) + def calculate_vel_N( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: + return np.nan - _param_ranges = { - "alpha": [0, 1000], - "alpha_2": [0, 0], - "beta": [0, 1000], - "gamma": [0, 1000], - } - x0 = { - "ul0": [0, 0], - "sl0": [0, 0], - "u0": [0, 1000], - "s0": [0, 1000], - "uu0": [0, 1000], - "ss0": [0, 1000], - "us0": [0, 1000], - } - Est = Mixture_KinDeg_NoSwitching(Deterministic(), Moments_NoSwitching()) - elif self.model.lower() == "mixture_stochastic_stochastic": - _param_ranges = { - "alpha": [0, 1000], - "alpha_2": [0, 0], - "beta": [0, 1000], - "gamma": [0, 1000], - } - X, X_raw = prepare_data_mix_has_splicing( - subset_adata, - subset_adata.var.index, - time, - layer_u=layers[2], - layer_s=layers[3], - layer_ul=layers[0], - layer_sl=layers[1], - total_layers=layers, - mix_model_indices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], - ) - x0 = { - "ul0": [0, 1000], - "sl0": [0, 1000], - "ul_ul0": [0, 1000], - "sl_sl0": [0, 1000], - "ul_sl0": [0, 1000], - "u0": [0, 1000], - "s0": [0, 1000], - "uu0": [0, 1000], - "ss0": [0, 1000], - "us0": [0, 1000], - } - Est = Mixture_KinDeg_NoSwitching(Moments_NoSwitching(), Moments_NoSwitching()) - else: - raise NotImplementedError( - f"model {self.model} with kinetic assumption is not implemented. 
" - f"current supported models for kinetics experiments include: stochastic, deterministic, mixture," - f"mixture_deterministic_stochastic or mixture_stochastic_stochastic" - ) - else: - total_layer = "M_t" if ("M_t" in subset_adata.layers.keys() and data_type == "smoothed") else "X_total" + def calculate_vel_T( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: + return np.nan - if self.model.lower() in ["deterministic", "stochastic"]: - layer = "M_n" if ("M_n" in subset_adata.layers.keys() and data_type == "smoothed") else "X_new" - X, X_raw = prepare_data_no_splicing( - subset_adata, - subset_adata.var.index, - time, - layer=layer, - total_layer=total_layer, - ) - elif self.model.lower().startswith("mixture"): - layers = ( - ["M_n", "M_t"] - if ("M_n" in subset_adata.layers.keys() and data_type == "smoothed") - else ["X_new", "X_total"] - ) - X, _, X_raw = prepare_data_deterministic( - subset_adata, - subset_adata.var.index, - time, - layers=layers, - total_layers=total_layer, - ) +class MixStdStmDynamics(LabeledDynamics): + """Dynamics model for the mixed steady state and stimulation labeling (mix_std_stm) experiment.""" + def calculate_vel_U( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: + return self.alpha1 - csr_matrix(self.beta[:, None]).multiply(U) - if self.model.lower() == "deterministic": - X = [X[i][0, :] for i in range(len(X))] - _param_ranges = { - "alpha": [0, 1000], - "gamma": [0, 1000], - } - x0 = {"u0": [0, 1000]} - Est, _ = ( - Estimation_DeterministicKinNosp, - Deterministic_NoSplicing, - ) - elif self.model.lower() == "stochastic": - x0 = { - "u0": [0, 1000], - "uu0": [0, 1000], - } - if has_switch: - _param_ranges = { - "a": [0, 1000], - "b": [0, 1000], - "alpha_a": [0, 1000], - "alpha_i": 0, - "gamma": [0, 1000], - } - Est, _ = Estimation_MomentKinNosp, Moments_Nosplicing - else: - _param_ranges = { - "alpha": [0, 1000], - "gamma": [0, 1000], - } - Est, _ = ( - Estimation_MomentKinNoSwitchNoSplicing, - Moments_NoSwitchingNoSplicing, - ) - elif self.model.lower() == "mixture": - _param_ranges = { - "alpha": [0, 1000], - "alpha_2": [0, 0], - "gamma": [0, 1000], - } - x0 = {"u0": [0, 0], "o0": [0, 1000]} - Est = Mixture_KinDeg_NoSwitching(Deterministic_NoSplicing(), Deterministic_NoSplicing()) - elif self.model.lower() == "mixture_deterministic_stochastic": - X, X_raw = prepare_data_mix_no_splicing( - subset_adata, - subset_adata.var.index, - time, - layer_n=layers[0], - layer_t=layers[1], - total_layer=total_layer, - mix_model_indices=[0, 2, 3], - ) + def calculate_vel_S( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: + return vel.vel_s(U, S) - _param_ranges = { - "alpha": [0, 1000], - "alpha_2": [0, 0], - "gamma": [0, 1000], - } - x0 = {"u0": [0, 1000], "o0": [0, 1000], "oo0": [0, 1000]} - Est = Mixture_KinDeg_NoSwitching( - Deterministic_NoSplicing(), - Moments_NoSwitchingNoSplicing(), - ) - elif self.model.lower() == "mixture_stochastic_stochastic": - X, X_raw = prepare_data_mix_no_splicing( - subset_adata, - subset_adata.var.index, - time, - layer_n=layers[0], - layer_t=layers[1], - total_layer=total_layer, - mix_model_indices=[0, 1, 2, 3], 
- ) - - _param_ranges = { - "alpha": [0, 1000], - "alpha_2": [0, 0], - "gamma": [0, 1000], - } - x0 = { - "u0": [0, 1000], - "uu0": [0, 1000], - "o0": [0, 1000], - "oo0": [0, 1000], - } - Est = Mixture_KinDeg_NoSwitching( - Moments_NoSwitchingNoSplicing(), - Moments_NoSwitchingNoSplicing(), - ) - else: - raise NotImplementedError( - f"model {self.model} with kinetic assumption is not implemented. " - f"current supported models for kinetics experiments include: stochastic, deterministic, " - f"mixture, mixture_deterministic_stochastic or mixture_stochastic_stochastic" - ) - _param_ranges = update_dict(_param_ranges, param_rngs) - x0_ = np.vstack([ran for ran in x0.values()]).T + def calculate_vel_N( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: + return self.alpha1 - csr_matrix(self.gamma[:, None]).multiply(self.u_new) - n_genes = subset_adata.n_vars - cost, logLL = np.zeros(n_genes), np.zeros(n_genes) - all_keys = list(_param_ranges.keys()) + list(x0.keys()) - all_keys = [cur_key for cur_key in all_keys if cur_key != "alpha_i"] - half_life, Estm = np.zeros(n_genes), [None] * n_genes - X_data, X_fit_data = [None] * n_genes, [None] * n_genes - if self.experiment_type: - popt = [None] * n_genes + def calculate_vel_T( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: + return self.alpha1 - csr_matrix(self.gamma[:, None]).multiply(T) - main_debug("model: %s, experiment_type: %s" % (self.model, self.experiment_type)) - for i_gene in tqdm(range(n_genes), desc="estimating kinetic-parameters using kinetic model"): - if self.model.lower().startswith("mixture"): - estm = Est - if self.model.lower() == "mixture": - cur_X_data = np.vstack([X[i_layer][i_gene] for i_layer in range(len(X))]) - if issparse(X_raw[0]): - cur_X_raw = np.hstack([X_raw[i_layer][:, i_gene].A for i_layer in range(len(X))]) - else: - cur_X_raw = np.hstack([X_raw[i_layer][:, i_gene] for i_layer in range(len(X))]) - else: - cur_X_data = X[i_gene] - cur_X_raw = X_raw[i_gene] + def calculate_vels( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Tuple: + """Override the velocity calculation function to calculate extra parameters u_new and alpha1.""" + if self.has_splicing: + u0, self.u_new, self.alpha1 = solve_alpha_2p_mat( + t0=np.max(self.t) - self.t, + t1=self.t, + alpha0=self.alpha[0], + beta=self.beta, + u1=N, + ) + vel_U = self.calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) + vel_S = self.calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) + else: + u0, self.u_new, self.alpha1 = solve_alpha_2p_mat( + t0=np.max(self.t) - self.t, + t1=self.t, + alpha0=self.alpha[0], + beta=self.gamma, + u1=N, + ) + vel_U, vel_S = np.nan, np.nan + vel_N = self.calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) + vel_T = self.calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) + return vel_U, vel_S, vel_N, vel_T - if issparse(cur_X_raw[0, 0]): - cur_X_raw = np.hstack((cur_X_raw[0, 0].A, cur_X_raw[1, 0].A)) - _, cost[i_gene] = estm.auto_fit(np.unique(time), cur_X_data) - ( - model_1, - model_2, - kinetic_parameters, - mix_x0, - ) = estm.export_dictionary().values() - tmp = list(kinetic_parameters.values()) - tmp.extend(mix_x0) - Estm[i_gene] = tmp - else: - cur_X_data, 
cur_X_raw = X[i_gene], X_raw[i_gene] +class MixKineticsDynamics(LabeledDynamics): + """Dynamics model for two mix experiment type: mix_kin_deg and mix_pulse_chase.""" + def calculate_vel_U( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: + return vel.vel_u(U, repeat=True) - if self.has_splicing: - alpha0 = guestimate_alpha(np.sum(cur_X_data, 0), np.unique(time)) - else: - alpha0 = ( - guestimate_alpha(cur_X_data, np.unique(time)) - if cur_X_data.ndim == 1 - else guestimate_alpha(cur_X_data[0], np.unique(time)) - ) + def calculate_vel_S( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: + return vel.vel_s(U, S) - if self.model.lower() == "stochastic": - _param_ranges.update({"alpha_a": [0, alpha0 * 10]}) - elif self.model.lower() == "deterministic": - _param_ranges.update({"alpha": [0, alpha0 * 10]}) - param_ranges = [ran for ran in _param_ranges.values()] + def calculate_vel_N( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: + return vel.vel_u(N, repeat=True) - estm = Est(*param_ranges, x0=x0_) if "x0" in inspect.getfullargspec(Est) else Est(*param_ranges) - _, cost[i_gene] = estm.fit_lsq(np.unique(time), cur_X_data, **self.est_kwargs) - if self.model.lower() == "deterministic": - Estm[i_gene] = estm.export_parameters() - else: - tmp = np.ma.array(estm.export_parameters(), mask=False) - tmp.mask[3] = True - Estm[i_gene] = tmp.compressed() + def calculate_vel_T( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Union[ndarray, csr_matrix]: + return vel.vel_u(T) if not self.has_splicing and self.NTR_vel else vel.vel_u(T, repeat=True) - if issparse(cur_X_raw[0, 0]): - cur_X_raw = np.hstack((cur_X_raw[0, 0].A, cur_X_raw[1, 0].A)) + def calculate_vels( + self, + vel: Velocity, + U: Union[ndarray, csr_matrix], + S: Union[ndarray, csr_matrix], + N: Union[ndarray, csr_matrix], + T: Union[ndarray, csr_matrix], + ) -> Tuple: + """Override the velocity calculation function to reset beta when the data contains splicing information.""" + if self.has_splicing: + vel_U = self.calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) + vel_S = self.calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) + vel.parameters["beta"] = self.gamma + else: + vel_U, vel_S = np.nan, np.nan + vel_N = self.calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) + vel_T = self.calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) + return vel_U, vel_S, vel_N, vel_T - X_data[i_gene] = cur_X_data - if self.model.lower().startswith("mixture"): - X_fit_data[i_gene] = estm.simulator.x.T - X_fit_data[i_gene][estm.model1.n_species:] *= estm.scale - else: - if hasattr(estm, "extract_data_from_simulator"): - X_fit_data[i_gene] = estm.extract_data_from_simulator() - else: - X_fit_data[i_gene] = estm.simulator.x.T - half_life[i_gene] = np.log(2) / Estm[i_gene][-1] +# TODO: rename this later +def dynamics_wrapper( + adata: AnnData, + filter_gene_mode: Literal["final", "basic", "no"] = "final", + use_smoothed: bool = True, + assumption_mRNA: Literal["ss", "kinetic", "auto"] = "auto", + assumption_protein: Literal["ss"] = "ss", + model: 
Literal["auto", "deterministic", "stochastic"] = "auto", + est_method: Literal["ols", "rlm", "ransac", "gmm", "negbin", "auto", "twostep", "direct"] = "auto", + NTR_vel: bool = False, + group: Optional[str] = None, + protein_names: Optional[List[str]] = None, + concat_data: bool = False, + log_unnormalized: bool = True, + one_shot_method: Literal["combined", "sci-fate", "sci_fate"] = "combined", + fraction_for_deg: bool = False, + re_smooth: bool = False, + sanity_check: bool = False, + del_2nd_moments: Optional[bool] = None, + cores: int = 1, + tkey: str = None, + **est_kwargs, +) -> AnnData: + """Predict the model and assumption if they are set as auto. Run corresponding Dynamics methods according to the + experiment type. More information can be found in the class BaseDynamics.""" + if "pp" not in adata.uns_keys(): + raise ValueError(f"\nPlease run `dyn.pp.receipe_monocle(adata)` before running this function!") + if model.lower() == "auto": + model = "stochastic" + model_was_auto = True + else: + model = model + model_was_auto = False - if self.model.lower().startswith("mixture"): - species = [0, 1, 2, 3] if self.has_splicing else [0, 1] - gof = GoodnessOfFit(estm.export_model(), params=estm.export_parameters()) - gof.prepare_data(time, cur_X_raw.T, species=species, normalize=True) - else: - gof = GoodnessOfFit( - estm.export_model(), - params=estm.export_parameters(), - x0=estm.simulator.x0, - ) - gof.prepare_data(time, cur_X_raw.T, normalize=True) + (experiment_type, has_splicing, has_labeling, splicing_labeling, has_protein,) = ( + adata.uns["pp"]["experiment_type"], + adata.uns["pp"]["has_splicing"], + adata.uns["pp"]["has_labeling"], + adata.uns["pp"]["splicing_labeling"], + adata.uns["pp"]["has_protein"], + ) - logLL[i_gene] = gof.calc_mean_squared_deviation() # .calc_gaussian_loglikelihood() - - Estm_df = pd.DataFrame(np.vstack(Estm), columns=[*all_keys[: len(Estm[0])]]) - - return Estm_df, half_life, cost, logLL, _param_ranges, X_data, X_fit_data + (NTR_vel, assump_mRNA) = get_auto_assump_mRNA( + subset_adata=adata, + has_splicing=has_splicing, + has_labeling=has_labeling, + use_moments=use_smoothed, + tkey=tkey, + NTR_vel=NTR_vel, + ) + if assumption_mRNA.lower() == "auto": + assumption_mRNA = assump_mRNA + if experiment_type.lower() == "conventional": + assumption_mRNA = "ss" + elif experiment_type.lower() in ["mix_pulse_chase", "deg", "kin"]: + assumption_mRNA = "kinetic" + if model.lower() == "stochastic" and experiment_type.lower() not in [ + "conventional", + "kinetics", + "degradation", + "kin", + "deg", + "one-shot", + ]: + """ + # temporially convert to deterministic model as moment model for mix_std_stm + and other types of labeling experiment is ongoing.""" -class DegradationDynamics(LabeledDynamics): - """Dynamics model for the degradation experiment. 
In degradation experiment, samples are chased after an extended - 4sU (or other nucleotide analog) labeling period and the wash-out to observe the decay of the abundance of the - (labeled) unspliced and spliced RNA decay over time.""" - def estimate_params_utils(self, params_est_kwargs): - ( - subset_adata, - data_type, - return_ntr, - ) = params_est_kwargs["subset_adata"], params_est_kwargs["data_type"], params_est_kwargs["return_ntr"] - has_switch = True - param_rngs = {} - time = subset_adata.obs[self.tkey].astype("float").values - if self.has_splicing and self.splicing_labeling: - layers = ( - ["M_ul", "M_sl", "M_uu", "M_su"] - if ("M_ul" in subset_adata.layers.keys() and data_type == "smoothed") - else ["X_ul", "X_sl", "X_uu", "X_su"] - ) + model = "deterministic" - if self.model.lower() in ["deterministic", "stochastic"]: - layer_u = "M_ul" if ("M_ul" in subset_adata.layers.keys() and data_type == "smoothed") else "X_ul" - layer_s = "M_sl" if ("M_sl" in subset_adata.layers.keys() and data_type == "smoothed") else "X_sl" + if model_was_auto and experiment_type.lower() in [ + "kinetic", + "kin", + "degradation", + "deg", + ]: + model = "deterministic" - X, X_raw = prepare_data_has_splicing( - subset_adata, - subset_adata.var.index, - time, - layer_u=layer_u, - layer_s=layer_s, - total_layers=layers, - return_ntr=return_ntr, - ) - elif self.model.lower().startswith("mixture"): - X, _, X_raw = prepare_data_deterministic( - subset_adata, - subset_adata.var.index, - time, - layers=layers, - total_layers=layers, - return_ntr=return_ntr, - ) + dynamics_kwargs = { + "adata": adata, + "filter_gene_mode": filter_gene_mode, + "use_smoothed": use_smoothed, + "assumption_mRNA": assumption_mRNA, + "assumption_protein": assumption_protein, + "model": model, + "model_was_auto": model_was_auto, + "experiment_type": experiment_type, + "has_splicing": has_splicing, + "has_labeling": has_labeling, + "splicing_labeling": splicing_labeling, + "has_protein": has_protein, + "est_method": est_method, + "NTR_vel": NTR_vel, + "group": group, + "protein_names": protein_names, + "concat_data": concat_data, + "log_unnormalized": log_unnormalized, + "one_shot_method": one_shot_method, + "fraction_for_deg": fraction_for_deg, + "re_smooth": re_smooth, + "sanity_check": sanity_check, + "del_2nd_moments": del_2nd_moments, + "cores": cores, + "tkey": tkey, + "est_kwargs": est_kwargs, + } - if self.model.lower() == "deterministic": - X = [X[i][[0, 1], :] for i in range(len(X))] - _param_ranges = { - "beta": [0, 1000], - "gamma": [0, 1000], - } - x0 = { - "u0": [0, 1000], - "s0": [0, 1000], - } - Est, _ = Estimation_DeterministicDeg, Deterministic - elif self.model.lower() == "stochastic": - _param_ranges = { - "beta": [0, 1000], - "gamma": [0, 1000], - } - x0 = { - "u0": [0, 1000], - "s0": [0, 1000], - "uu0": [0, 1000], - "ss0": [0, 1000], - "us0": [0, 1000], - } - Est, _ = Estimation_MomentDeg, Moments_NoSwitching - else: - raise NotImplementedError( - f"model {self.model} with kinetic assumption is not implemented. " - f"current supported models for degradation experiment include: " - f"stochastic, deterministic." 
- ) + if experiment_type == "conventional": + estimator = SplicedDynamics(dynamics_kwargs) + elif experiment_type in ["one-shot", "one_shot"]: + estimator = OneShotDynamics(dynamics_kwargs) + elif experiment_type == "kin": + if assumption_mRNA == "ss": + estimator = SSKineticsDynamics(dynamics_kwargs) + elif assumption_mRNA == "kinetic": + if model == 'deterministic': + estimator = KineticsDynamics(dynamics_kwargs) + elif model == 'stochastic': + estimator = KineticsStormDynamics(dynamics_kwargs) else: - total_layer = "M_t" if ("M_t" in subset_adata.layers.keys() and data_type == "smoothed") else "X_total" - - layer = "M_n" if ("M_n" in subset_adata.layers.keys() and data_type == "smoothed") else "X_new" - X, X_raw = prepare_data_no_splicing( - subset_adata, - subset_adata.var.index, - time, - layer=layer, - total_layer=total_layer, - return_ntr=return_ntr, - ) - - if self.model.lower() == "deterministic": - X = [X[i][0, :] for i in range(len(X))] - _param_ranges = { - "gamma": [0, 10], - } - x0 = {"u0": [0, 1000]} - Est, _ = ( - Estimation_DeterministicDegNosp, - Deterministic_NoSplicing, - ) - elif self.model.lower() == "stochastic": - _param_ranges = { - "gamma": [0, 10], - } - x0 = {"u0": [0, 1000], "uu0": [0, 1000]} - Est, _ = Estimation_MomentDegNosp, Moments_NoSwitchingNoSplicing - else: - raise NotImplementedError( - f"model {self.model} with kinetic assumption is not implemented. " - f"current supported models for degradation experiment include: " - f"stochastic, deterministic.") - _param_ranges = update_dict(_param_ranges, param_rngs) - x0_ = np.vstack([ran for ran in x0.values()]).T - - n_genes = subset_adata.n_vars - cost, logLL = np.zeros(n_genes), np.zeros(n_genes) - all_keys = list(_param_ranges.keys()) + list(x0.keys()) - all_keys = [cur_key for cur_key in all_keys if cur_key != "alpha_i"] - half_life, Estm = np.zeros(n_genes), [None] * n_genes - X_data, X_fit_data = [None] * n_genes, [None] * n_genes - if self.experiment_type: - popt = [None] * n_genes - - main_debug("model: %s, experiment_type: %s" % (self.model, self.experiment_type)) - for i_gene in tqdm(range(n_genes), desc="estimating kinetic-parameters using kinetic model"): - if self.model.lower().startswith("mixture"): - estm = Est - if self.model.lower() == "mixture": - cur_X_data = np.vstack([X[i_layer][i_gene] for i_layer in range(len(X))]) - if issparse(X_raw[0]): - cur_X_raw = np.hstack([X_raw[i_layer][:, i_gene].A for i_layer in range(len(X))]) - else: - cur_X_raw = np.hstack([X_raw[i_layer][:, i_gene] for i_layer in range(len(X))]) - else: - cur_X_data = X[i_gene] - cur_X_raw = X_raw[i_gene] + raise NotImplementedError("This method has not been implemented.") + elif experiment_type == "deg": + estimator = DegradationDynamics(dynamics_kwargs) + elif experiment_type == "mix_std_stm": + estimator = MixStdStmDynamics(dynamics_kwargs) + elif experiment_type in ["mix_kin_deg", "mix_pulse_chase"]: + estimator = MixKineticsDynamics(dynamics_kwargs) + else: + raise NotImplementedError("This method has not been implemented.") + adata = estimator.estimate() + return adata - if issparse(cur_X_raw[0, 0]): - cur_X_raw = np.hstack((cur_X_raw[0, 0].A, cur_X_raw[1, 0].A)) - _, cost[i_gene] = estm.auto_fit(np.unique(time), cur_X_data) - ( - model_1, - model_2, - kinetic_parameters, - mix_x0, - ) = estm.export_dictionary().values() - tmp = list(kinetic_parameters.values()) - tmp.extend(mix_x0) - Estm[i_gene] = tmp - else: - estm = Est() - cur_X_data, cur_X_raw = X[i_gene], X_raw[i_gene] +# incorporate the model selection code 
soon
+def dynamics(
+    adata: AnnData,
+    filter_gene_mode: Literal["final", "basic", "no"] = "final",
+    use_smoothed: bool = True,
+    assumption_mRNA: Literal["ss", "kinetic", "auto"] = "auto",
+    assumption_protein: Literal["ss"] = "ss",
+    model: Literal["auto", "deterministic", "stochastic"] = "auto",
+    est_method: Literal["ols", "rlm", "ransac", "gmm", "negbin", "auto", "twostep", "direct"] = "auto",
+    NTR_vel: bool = False,
+    group: Optional[str] = None,
+    protein_names: Optional[List[str]] = None,
+    concat_data: bool = False,
+    log_unnormalized: bool = True,
+    one_shot_method: Literal["combined", "sci-fate", "sci_fate"] = "combined",
+    fraction_for_deg: bool = False,
+    re_smooth: bool = False,
+    sanity_check: bool = False,
+    del_2nd_moments: Optional[bool] = None,
+    cores: int = 1,
+    tkey: str = None,
+    **est_kwargs,
+) -> AnnData:
+    """Inclusive model of expression dynamics that considers splicing, metabolic labeling and protein translation.
+
+    The function supports learning high-dimensional velocity vectors from droplet-based (10x, inDrop, drop-seq,
+    etc.), scSLAM-seq, NASC-seq, sci-fate, scNT-seq, scEU-seq, cite-seq or REAP-seq datasets.
+
+    Args:
+        adata: an AnnData object.
+        filter_gene_mode: The string indicating which mode of gene filter will be used. Defaults to "final".
+        use_smoothed: whether to use the smoothed data when estimating kinetic parameters and calculating velocity for
+            each gene. When you have time-series data (`tkey` is not None), we recommend smoothing the data among
+            cells from each time point. Defaults to True.
+        assumption_mRNA: Parameter estimation assumption for mRNA. Available options are:
+            (1) 'ss': pseudo steady state;
+            (2) 'kinetic' or None: degradation and kinetic data without the steady state assumption.
+            (3) 'auto': dynamo will choose a reasonable assumption of the system under study automatically.
+            If no labeling data exists, assumption_mRNA will automatically be set to 'ss'. For a one-shot experiment,
+            assumption_mRNA is set to None. However, we will use the steady state assumption to estimate the
+            parameters alpha and gamma, either by a deterministic linear regression or by the first-order decay
+            approach in line with the sci-fate paper.
+            Defaults to "auto".
+        assumption_protein: Parameter estimation assumption for protein. Available options are:
+            (1) 'ss': pseudo steady state;
+            Defaults to "ss".
+        model: String indicating which estimation model will be used.
+            Available options are:
+            (1) 'deterministic': The method based on `deterministic` ordinary differential equations;
+            (2) 'stochastic' or `moment`: The new method from us that is based on `stochastic` master equations;
+            Note that the `kinetic` model does not require the `experiment_type` to be non-`conventional`. As with
+            other labeling experiments, if you specify the `tkey`, dynamo can also apply the `kinetic` model to
+            `conventional` scRNA-seq datasets. A "model_selection" model will be supported soon in which alpha, beta
+            and gamma will be modeled as a function of time.
+            Defaults to "auto".
+        est_method: This parameter should be used in conjunction with the `model` parameter.
+            Available options when the `model` is 'ss' include:
+            (1) 'ols': The canonical method of Ordinary Least Squares regression from the seminal RNA velocity paper
+                based on deterministic ordinary differential equations;
+            (2) 'rlm': The robust linear models from statsmodels. Robust regression provides an alternative to OLS
+                regression by lowering the restrictions on assumptions and dampening the effect of outliers in order
+                to fit the majority of the data.
+            (3) 'ransac': RANSAC (RANdom SAmple Consensus) algorithm for robust linear regression. RANSAC is an
+                iterative algorithm for the robust estimation of parameters from a subset of inliers from the
+                complete dataset. The RANSAC implementation is based on the RANSACRegressor function from the sklearn
+                package. Note that if `rlm` or `ransac` fails, it will fall back to the `ols` method. In addition,
+                `ols`, `rlm` and `ransac` can only be used in conjunction with the `deterministic` model.
+            (4) 'gmm': The new generalized method of moments from us that is based on master equations, similar to
+                the "moment" model in the excellent scVelo package;
+            (5) 'negbin': The new method from us that models steady state RNA expression as a negative binomial
+                distribution, also built upon master equations.
+            (6) 'auto': dynamo will choose the suitable estimation method based on the `assumption_mRNA`,
+                `experiment_type` and `model` parameters.
+            Note that all those methods require using extreme data points (except negbin, which uses all data points)
+            for estimation. Extreme data points are defined as the data from cells whose expression of unspliced /
+            spliced or new / total RNA, etc. is in the top or bottom 5%, for example. `linear_regression` only
+            considers the mean of RNA species (based on the `deterministic` ordinary differential equations) while
+            moment based methods (`gmm`, `negbin`) consider both the first moment (mean) and the second moment
+            (uncentered variance) of RNA species (based on the `stochastic` master equations).
+            The above methods are all (generalized) linear regression based methods. In addition to returning the
+            estimated parameters (including RNA half-life), they also return R-squared (either just for extreme data
+            points or all data points) as well as the log-likelihood of the fitting, which will be used for the
+            transition matrix and velocity embedding.
+            Available options when the `assumption_mRNA` is 'kinetic' include:
+            (1) 'auto': dynamo will choose the suitable estimation method based on the `assumption_mRNA`,
+                `experiment_type` and `model` parameters.
+            (2) `twostep`: first, for each time point, estimate K (1 - e^{-rt}) using the total and new RNA data;
+                then regress -np.log(1 - K) against t to get the degradation rate gamma (a minimal sketch follows
+                this argument list). When splicing and labeling data both exist, replacing new/total with ul/u can be
+                used to estimate beta. Suitable for velocity estimation.
+            (3) `direct` (default): method that directly uses the kinetic model to estimate rate parameters,
+                generally not good for velocity estimation.
+            Under the `kinetic` model, the choice of estimation is `experiment_type` dependent. For `kinetics`
+            experiments, dynamo supports models with or without RNA bursting, and it adaptively estimates parameters
+            based on whether the data contains splicing information.
+            Under the `kinetic` assumption, the above methods use non-linear least squares fitting. In addition to
+            returning the estimated parameters (including RNA half-life), they also return the log-likelihood of the
+            fitting, which will be used for the transition matrix and velocity embedding.
+            All `est_method` options use least squares to estimate the optimal parameters, with a Latin hypercube
+            sampler for the initial sampling. Defaults to "auto".
+        NTR_vel: whether to use NTR (new/total ratio) velocity for labeling datasets. Defaults to False.
+        group: the column key/name that identifies the grouping information (for example, clusters that correspond to
+            different cell types) of cells. This will be used to calculate the first/second moments and covariance
+            for cells in each group. It will also enable estimating group-specific (i.e., cell-type-specific) kinetic
+            parameters. Defaults to None.
+        protein_names: a list of gene names that corresponds to the rows of the measured proteins in the `X_protein`
+            of the `obsm` attribute. The names have to be included in the adata.var.index. Defaults to None.
+        concat_data: whether to concatenate data before estimation. If your data is a list of matrices for each time
+            point, this needs to be set to True. Defaults to False.
+        log_unnormalized: whether to log transform the unnormalized data. Defaults to True.
+        one_shot_method: The method that will be used for estimating kinetic parameters for one-shot experiment data.
+            (1) the "sci-fate" method directly solves gamma with the first-order decay model;
+            (2) the "combined" model uses the linear regression under steady state to estimate the relative gamma,
+                and then calculates the absolute gamma (degradation rate), beta (splicing rate) and cell-wise alpha
+                (transcription rate). Defaults to "combined".
+        fraction_for_deg: whether to use the fraction of labeled RNA instead of the raw labeled RNA to estimate the
+            degradation parameter. Defaults to False.
+        re_smooth: whether to re-smooth the adata and also recalculate the first/second moments or covariance.
+            Defaults to False.
+        sanity_check: whether to perform a sanity check before estimating kinetic parameters and velocity vectors,
+            currently only applicable to kinetic or degradation metabolic labeling based scRNA-seq data. The basic
+            idea is that for a kinetic (degradation) experiment, the total labeled RNA for each gene should increase
+            (decrease) over time. Genes that don't satisfy this criterion will be ignored during the estimation.
+            Defaults to False.
+        del_2nd_moments: whether to remove second moments or covariances. By default it is `False`, which avoids
+            recalculating the 2nd moments or covariance but may take a lot of memory when your dataset is big. Set
+            this to `True` when your data is huge (like > 25,000 cells or so) to reduce the memory footprint.
+            Defaults to None.
+        cores: number of cores to run the estimation. If cores is set to be > 1, multiprocessing will be used to
+            parallelize the parameter estimation. Currently only applicable to cases when assumption_mRNA is `ss`, or
+            when experiment_type is either "one-shot" or "mix_std_stm". Defaults to 1.
+        tkey: the column key for the labeling time of cells in .obs. Used for labeling based scRNA-seq data. If
+            `tkey` is None, then `adata.uns["pp"]["tkey"]` will be checked and used if it exists. Defaults to None.
+        **est_kwargs: Other arguments passed to the fit method (steady state models) or estimation methods (kinetic
+            models).
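The `twostep` option described in the est_method entry above first computes, per cell, K = new / total (so that K = 1 - e^{-gamma * t}) and then regresses -log(1 - K) against the labeling time to obtain gamma. The snippet below is only a minimal sketch of that idea under those assumptions; the function name is hypothetical and this is not the lin_reg_gamma_synthesis implementation used by this patch.

import numpy as np

def twostep_gamma_sketch(new, total, t):
    # Illustrative two-step sketch (not the dynamo implementation):
    # step 1: per-cell K = new / total, with K = 1 - exp(-gamma * t);
    # step 2: least-squares fit of -log(1 - K) = gamma * t through the origin.
    K = np.clip(new / np.clip(total, 1e-9, None), 1e-6, 1 - 1e-6)
    y = -np.log(1.0 - K)
    gamma = float(np.sum(t * y) / np.sum(t * t))
    return gamma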
- X_data[i_gene] = cur_X_data
- if self.model.lower().startswith("mixture"):
- X_fit_data[i_gene] = estm.simulator.x.T
- X_fit_data[i_gene][estm.model1.n_species:] *= estm.scale
- else:
- if hasattr(estm, "extract_data_from_simulator"):
- X_fit_data[i_gene] = estm.extract_data_from_simulator()
- else:
- X_fit_data[i_gene] = estm.simulator.x.T
+ Raises:
+ ValueError: preprocessing not performed.
+ Exception: no genes pass the filter.
+ Exception: too few valid genes.
- half_life[i_gene] = estm.calc_half_life("gamma")
+ Returns:
+ An updated AnnData object with estimated kinetic parameters, inferred velocity and estimation related
+ information included. The estimated kinetic parameters are currently appended to .obs (should move to .obsm with
+ the key `dynamics` later). Depending on the estimation method, the experiment type and whether you applied
+ estimation for each group via `group`, the number of returned parameters can vary. For conventional scRNA-seq
+ (including cite-seq or other types of protein/RNA coassays) and sometimes metabolic labeling data, the
+ parameters will at most include:
+ alpha: Transcription rate
+ beta: Splicing rate
+ gamma: Spliced RNA degradation rate
+ eta: Translation rate (only applicable to RNA/protein coassay)
+ delta: Protein degradation rate (only applicable to RNA/protein coassay)
+ alpha_b: intercept of alpha fit
+ beta_b: intercept of beta fit
+ gamma_b: intercept of gamma fit
+ eta_b: intercept of eta fit (only applicable to RNA/protein coassay)
+ delta_b: intercept of delta fit (only applicable to RNA/protein coassay)
+ alpha_r2: r-squared for goodness of fit of alpha estimation
+ beta_r2: r-squared for goodness of fit of beta estimation
+ gamma_r2: r-squared for goodness of fit of gamma estimation
+ eta_r2: r-squared for goodness of fit of eta estimation (only applicable to RNA/protein coassay)
+ delta_r2: r-squared for goodness of fit of delta estimation (only applicable to RNA/protein coassay)
+ alpha_logLL: loglikelihood of alpha estimation (only applicable to stochastic model)
+ beta_logLL: loglikelihood of beta estimation (only applicable to stochastic model)
+ gamma_logLL: loglikelihood of gamma estimation (only applicable to stochastic model)
+ eta_logLL: loglikelihood of eta estimation (only applicable to stochastic model and RNA/protein coassay)
+ delta_logLL: loglikelihood of delta estimation (only applicable to stochastic model and RNA/protein
+ coassay)
+ uu0: estimated amount of unspliced unlabeled RNA at time 0 (only applicable to data with both splicing
+ and labeling)
+ ul0: estimated amount of unspliced labeled RNA at time 0 (only applicable to data with both splicing
+ and labeling)
+ su0: estimated amount of spliced unlabeled RNA at time 0 (only applicable to data with both splicing
+ and labeling)
+ sl0: estimated amount of spliced labeled RNA at time 0 (only applicable to data with both splicing and
+ labeling)
+ U0: estimated amount of unspliced RNA (uu + ul) at time 0
+ S0: estimated amount of spliced RNA (su + sl) at time 0
+ total0: estimated amount of total RNA (U + S) at time 0
+ half_life: Spliced mRNA's half-life (log(2) / gamma)
- if self.model.lower().startswith("mixture"):
- species = [0, 1, 2, 3] if self.has_splicing else [0, 1]
- gof = GoodnessOfFit(estm.export_model(), params=estm.export_parameters())
- gof.prepare_data(time, cur_X_raw.T, species=species, normalize=True)
- else:
- gof = GoodnessOfFit(
- estm.export_model(),
- params=estm.export_parameters(),
- x0=estm.simulator.x0,
- )
- gof.prepare_data(time, cur_X_raw.T, normalize=True)
+ Note that all data points are used when estimating r2, although only extreme data points are used for
+ estimating the kinetic parameters. This is applicable to all estimation methods, whether `linear_regression`,
+ `gmm` or `negbin`. By default we set the intercept to be 0.
- logLL[i_gene] = gof.calc_mean_squared_deviation() # .calc_gaussian_loglikelihood()
+ For metabolic labeling data, the kinetic parameters will at most include:
+ alpha: Transcription rate (effective rate when RNA promoter switching is considered)
+ beta: Splicing rate
+ gamma: Spliced RNA degradation rate
+ a: Switching rate from active promoter state to inactive promoter state
+ b: Switching rate from inactive promoter state to active promoter state
+ alpha_a: Transcription rate for active promoter
+ alpha_i: Transcription rate for inactive promoter
+ cost: cost of the kinetic parameters estimation
+ logLL: loglikelihood of kinetic parameters estimation
+ alpha_r2: r-squared for goodness of fit of alpha estimation
+ beta_r2: r-squared for goodness of fit of beta estimation
+ gamma_r2: r-squared for goodness of fit of gamma estimation
+ uu0: estimated amount of unspliced unlabeled RNA at time 0 (only applicable to data with both splicing
+ and labeling)
+ ul0: estimated amount of unspliced labeled RNA at time 0 (only applicable to data with both splicing
+ and labeling)
+ su0: estimated amount of spliced unlabeled RNA at time 0 (only applicable to data with both splicing
+ and labeling)
+ sl0: estimated amount of spliced labeled RNA at time 0 (only applicable to data with both splicing and
+ labeling)
+ u0: estimated amount of unspliced RNA (including uu, ul) at time 0
+ s0: estimated amount of spliced RNA (including su, sl) at time 0
+ total0: estimated amount of total RNA (including U, S) at time 0
+ p_half_life: half-life for unspliced mRNA
+ half_life: half-life for spliced mRNA
- if self.est_method == "twostep" and self.has_splicing:
- layers = ["M_u", "M_s"] if ("M_u" in subset_adata.layers.keys() and data_type == "smoothed") else ["X_u",
- "X_s"]
- U, S = (
- subset_adata.layers[layers[0]].T,
- subset_adata.layers[layers[1]].T,
- )
- US, S2 = subset_adata.layers["M_us"].T, subset_adata.layers["M_ss"].T
- # beta, beta_r2 = lin_reg_gamma_synthesis(U, Ul, time, perc_right=100)
- gamma_k, gamma_b, gamma_all_r2, gamma_all_logLL = fit_slope_stochastic(
- S, U, US, S2, perc_left=None, perc_right=5
- )
+ If sanity_check has been performed, a column with key `sanity_check` will also be included, indicating which
+ genes pass the filter (`filter_gene_mode`) and the sanity check. This is only applicable to kinetic and
+ degradation metabolic labeling experiments.
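To make the sanity-check criterion described above concrete, here is a numpy-only sketch of the per-gene test (helper names are illustrative; the actual code below uses `strat_mom` and `fit_linreg`):

    import numpy as np

    def passes_sanity_check(labeled, t, experiment_type="kin"):
        # labeled: (n_cells,) labeled counts of one gene; t: (n_cells,) labeling times
        t_uniq = np.unique(t)
        y = np.array([np.nanmean(labeled[t == ti]) for ti in t_uniq])  # time-stratified mean
        slope = np.polyfit(t_uniq, y, 1)[0]                            # slope of mean labeled RNA over time
        return slope > 0 if experiment_type == "kin" else slope < 0

    # toy example: labeled RNA accumulating over a 0.5 / 1 / 2 hour pulse
    t = np.repeat([0.5, 1.0, 2.0], 100)
    labeled = np.random.poisson(lam=1.0 + 2.0 * t)
    print(passes_sanity_check(labeled, t, "kin"))  # True (with overwhelming probability)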
- Estm_df = pd.DataFrame(np.vstack(Estm), columns=[*all_keys[: len(Estm[0])]]) - Estm_df["gamma_k"] = gamma_k # gamma_k = gamma / beta - Estm_df["beta"] = Estm_df["gamma"] / gamma_k # gamma_k = gamma / beta - Estm_df["gamma_r2"] = gamma_all_r2 - else: - Estm_df = pd.DataFrame(np.vstack(Estm), columns=[*all_keys[: len(Estm[0])]]) + In addition, the `dynamics` key of the .uns attribute corresponds to a dictionary that includes the + following keys: + t: An array like object that indicates the time point of each cell used during parameters estimation + (applicable only to kinetic models) + group: The group that you used to estimate parameters group-wise + X_data: The input that was used for estimating parameters (applicable only to kinetic models) + X_fit_data: The data that was fitted during parameters estimation (applicable only to kinetic models) + asspt_mRNA: Assumption of mRNA dynamics (steady state or kinetic) + experiment_type: Experiment type (either conventional or metabolic labeling based) + normalized: Whether to normalize data + model: Model used for the parameter estimation (either auto, deterministic or stochastic) + has_splicing: Does the adata has splicing? detected automatically + has_labeling: Does the adata has labelling? detected automatically + has_protein: Does the adata has protein information? detected automatically + use_smoothed: Whether to use smoothed data (or first moment, done via local average of neighbor cells) + NTR_vel: Whether to estimate NTR velocity + log_unnormalized: Whether to log transform unnormalized data. + """ - return Estm_df, half_life, cost, logLL, _param_ranges, X_data, X_fit_data + del_2nd_moments = DynamoAdataConfig.use_default_var_if_none( + del_2nd_moments, DynamoAdataConfig.DYNAMICS_DEL_2ND_MOMENTS_KEY + ) + if "pp" not in adata.uns_keys(): + raise ValueError(f"\nPlease run `dyn.pp.receipe_monocle(adata)` before running this function!") + if tkey is None: + tkey = adata.uns["pp"]["tkey"] + (experiment_type, has_splicing, has_labeling, splicing_labeling, has_protein,) = ( + adata.uns["pp"]["experiment_type"], + adata.uns["pp"]["has_splicing"], + adata.uns["pp"]["has_labeling"], + adata.uns["pp"]["splicing_labeling"], + adata.uns["pp"]["has_protein"], + ) - def calculate_vel_U( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], - ) -> Union[ndarray, csr_matrix]: - return np.nan - - def calculate_vel_S( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], - ) -> Union[ndarray, csr_matrix]: - return vel.vel_s(U, S) - - def calculate_vel_N( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], - ) -> Union[ndarray, csr_matrix]: - return np.nan + X_data, X_fit_data = None, None + filter_list, filter_gene_mode_list = ( + [ + "use_for_pca", + "pass_basic_filter", + "no", + ], + ["final", "basic", "no"], + ) + filter_checker = [i in adata.var.columns for i in filter_list[:2]] + filter_checker.append(True) + filter_id = filter_gene_mode_list.index(filter_gene_mode) + which_filter = np.where(filter_checker[filter_id:])[0][0] + filter_id - def calculate_vel_T( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], - ) -> Union[ndarray, 
csr_matrix]: - return np.nan + filter_gene_mode = filter_gene_mode_list[which_filter] + valid_bools = get_valid_bools(adata, filter_gene_mode) + gene_num = sum(valid_bools) + if gene_num == 0: + raise Exception(f"no genes pass filter. Try resetting `filter_gene_mode = 'no'` to use all genes.") -class MixStdStmDynamics(LabeledDynamics): - """Dynamics model for the mixed steady state and stimulation labeling (mix_std_stm) experiment.""" - def calculate_vel_U( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], - ) -> Union[ndarray, csr_matrix]: - return self.alpha1 - csr_matrix(self.beta[:, None]).multiply(U) + if model.lower() == "auto": + model = "stochastic" + model_was_auto = True + else: + model_was_auto = False - def calculate_vel_S( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], - ) -> Union[ndarray, csr_matrix]: - return vel.vel_s(U, S) + if tkey is not None: + if adata.obs[tkey].max() > 60: + main_warning( + "Looks like you are using minutes as the time unit. For the purpose of numeric stability, " + "we recommend using hour as the time unit." + ) - def calculate_vel_N( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], - ) -> Union[ndarray, csr_matrix]: - return self.alpha1 - csr_matrix(self.gamma[:, None]).multiply(self.u_new) + if model.lower() == "stochastic" or use_smoothed or re_smooth: + M_layers = [i for i in adata.layers.keys() if i.startswith("M_")] - def calculate_vel_T( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], - ) -> Union[ndarray, csr_matrix]: - return self.alpha1 - csr_matrix(self.gamma[:, None]).multiply(T) + if len(M_layers) < 2 or re_smooth: + main_info("removing existing M layers:%s..." % (str(list(M_layers))), indent_level=2) + for i in M_layers: + del adata.layers[i] + main_info("making adata smooth...", indent_level=2) - def calculate_vels( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], - ) -> Tuple: - """Override the velocity calculation function to calculate extra parameters u_new and alpha1.""" - if self.has_splicing: - u0, self.u_new, self.alpha1 = solve_alpha_2p_mat( - t0=np.max(self.t) - self.t, - t1=self.t, - alpha0=self.alpha[0], - beta=self.beta, - u1=N, + if group is not None and group in adata.obs.columns: + moments(adata, genes=valid_bools, group=group) + else: + moments(adata, genes=valid_bools, group=tkey) + elif tkey is not None: + main_warning( + f"You used tkey {tkey} (or group {group}), but you have calculated local smoothing (1st moment) " + f"for your data before. Please ensure you used the desired tkey or group when the smoothing was " + f"performed. Try setting re_smooth = True if not sure." 
) - vel_U = self.calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) - vel_S = self.calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) - else: - u0, self.u_new, self.alpha1 = solve_alpha_2p_mat( - t0=np.max(self.t) - self.t, - t1=self.t, - alpha0=self.alpha[0], - beta=self.gamma, - u1=N, + + valid_adata = adata[:, valid_bools].copy() + if group is not None and group in adata.obs.columns: + _group = adata.obs[group].unique() + if any(adata.obs[group].value_counts() < 50): + main_warning( + f"Note that some groups have less than 50 cells, this may lead to the velocities for some " + f"cells are all NaN values and cause issues for all downstream analysis. Please try to " + f"coarse-grain cell groupings. Cell number for each group are {adata.obs[group].value_counts()}" ) - vel_U, vel_S = np.nan, np.nan - vel_N = self.calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) - vel_T = self.calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) - return vel_U, vel_S, vel_N, vel_T + else: + _group = ["_all_cells"] -class MixKineticsDynamics(LabeledDynamics): - """Dynamics model for two mix experiment type: mix_kin_deg and mix_pulse_chase.""" - def estimate_params_utils(self, params_est_kwargs): + for cur_grp_i, cur_grp in enumerate(_group): + if cur_grp == "_all_cells": + kin_param_pre = "" + cur_cells_bools = np.ones(valid_adata.shape[0], dtype=bool) + subset_adata = valid_adata[cur_cells_bools] + else: + kin_param_pre = str(group) + "_" + str(cur_grp) + "_" + cur_cells_bools = (valid_adata.obs[group] == cur_grp).values + subset_adata = valid_adata[cur_cells_bools] + + if model.lower() == "stochastic" or use_smoothed: + moments(subset_adata) ( + U, + Ul, + S, + Sl, + P, + US, + U2, + S2, + t, + normalized, + ind_for_proteins, + assump_mRNA, + ) = get_data_for_kin_params_estimation( subset_adata, - data_type, - return_ntr, - ) = params_est_kwargs["subset_adata"], params_est_kwargs["data_type"], params_est_kwargs["return_ntr"] - has_switch = True - param_rngs = {} - time = subset_adata.obs[self.tkey].astype("float").values - total_layer = "M_t" if ("M_t" in subset_adata.layers.keys() and data_type == "smoothed") else "X_total" + has_splicing, + has_labeling, + model, + use_smoothed, + tkey, + protein_names, + log_unnormalized, + NTR_vel, + ) - if self.model.lower() in ["deterministic"]: - layer = "M_n" if ("M_n" in subset_adata.layers.keys() and data_type == "smoothed") else "X_new" - X, X_raw = prepare_data_no_splicing( - subset_adata, - subset_adata.var.index, - time, - layer=layer, - total_layer=total_layer, - ) - if self.model.lower() == "deterministic": - X = [X[i][0, :] for i in range(len(X))] - _param_ranges = { - "alpha": [0, 1000], - "gamma": [0, 1000], - } - x0 = {"u0": [0, 1000]} - Est = Estimation_KineticChase - else: - raise NotImplementedError( - f"only `deterministic` model implemented for mix_pulse_chase/mix_kin_deg experiment!" 
+ valid_bools_ = valid_bools.copy() + if sanity_check and experiment_type.lower() in ["kin", "deg"]: + indices_valid_bools = np.where(valid_bools)[0] + t, L = ( + t.flatten(), + (0 if Ul is None else Ul) + (0 if Sl is None else Sl), ) - _param_ranges = update_dict(_param_ranges, param_rngs) - x0_ = np.vstack([ran for ran in x0.values()]).T + t_uniq = np.unique(t) - n_genes = subset_adata.n_vars - cost, logLL = np.zeros(n_genes), np.zeros(n_genes) - all_keys = list(_param_ranges.keys()) + list(x0.keys()) - all_keys = [cur_key for cur_key in all_keys if cur_key != "alpha_i"] - half_life, Estm = np.zeros(n_genes), [None] * n_genes - X_data, X_fit_data = [None] * n_genes, [None] * n_genes - if self.experiment_type: - popt = [None] * n_genes + valid_gene_checker = np.zeros(gene_num, dtype=bool) + for L_iter, cur_L in tqdm( + enumerate(L), + desc=f"sanity check of {experiment_type} experiment data:", + ): + cur_L = cur_L.A.flatten() if issparse(cur_L) else cur_L.flatten() + y = strat_mom(cur_L, t, np.nanmean) + slope, _ = fit_linreg(t_uniq, y, intercept=True, r2=False) + valid_gene_checker[L_iter] = ( + True + if (slope > 0 and experiment_type == "kin") or (slope < 0 and experiment_type == "deg") + else False + ) + valid_bools_[indices_valid_bools[~valid_gene_checker]] = False + main_warning(f"filtering {gene_num - valid_gene_checker.sum()} genes after sanity check.") - main_debug("model: %s, experiment_type: %s" % (self.model, self.experiment_type)) - for i_gene in tqdm(range(n_genes), desc="estimating kinetic-parameters using kinetic model"): - if self.model.lower().startswith("mixture"): - estm = Est - if self.model.lower() == "mixture": - cur_X_data = np.vstack([X[i_layer][i_gene] for i_layer in range(len(X))]) - if issparse(X_raw[0]): - cur_X_raw = np.hstack([X_raw[i_layer][:, i_gene].A for i_layer in range(len(X))]) - else: - cur_X_raw = np.hstack([X_raw[i_layer][:, i_gene] for i_layer in range(len(X))]) - else: - cur_X_data = X[i_gene] - cur_X_raw = X_raw[i_gene] - - if issparse(cur_X_raw[0, 0]): - cur_X_raw = np.hstack((cur_X_raw[0, 0].A, cur_X_raw[1, 0].A)) + if len(valid_bools_) < 5: + raise Exception( + f"After sanity check, you have less than 5 valid genes. Something is wrong about your " + f"metabolic labeling experiment!" 
+ ) - _, cost[i_gene] = estm.auto_fit(np.unique(time), cur_X_data) - ( - model_1, - model_2, - kinetic_parameters, - mix_x0, - ) = estm.export_dictionary().values() - tmp = list(kinetic_parameters.values()) - tmp.extend(mix_x0) - Estm[i_gene] = tmp - else: - estm = Est() - cur_X_data, cur_X_raw = X[i_gene], X_raw[i_gene] + U, Ul, S, Sl = ( + (None if U is None else U[valid_gene_checker, :]), + (None if Ul is None else Ul[valid_gene_checker, :]), + (None if S is None else S[valid_gene_checker, :]), + (None if Sl is None else Sl[valid_gene_checker, :]), + ) + subset_adata = subset_adata[:, valid_gene_checker] + adata.var[kin_param_pre + "sanity_check"] = valid_bools_ - popt[i_gene], cost[i_gene] = estm.auto_fit(np.unique(time), cur_X_data) - Estm[i_gene] = estm.export_parameters() + if assumption_mRNA.lower() == "auto": + assumption_mRNA = assump_mRNA + if experiment_type.lower() == "conventional": + assumption_mRNA = "ss" + elif experiment_type.lower() in ["mix_pulse_chase", "deg", "kin"]: + assumption_mRNA = "kinetic" - if issparse(cur_X_raw[0, 0]): - cur_X_raw = np.hstack((cur_X_raw[0, 0].A, cur_X_raw[1, 0].A)) - # model_1, kinetic_parameters, mix_x0 = estm.export_dictionary().values() - # tmp = list(kinetic_parameters.values()) - # tmp.extend(mix_x0) - # Estm[i_gene] = tmp + if model.lower() == "stochastic" and experiment_type.lower() not in [ + "conventional", + "kinetics", + "degradation", + "kin", + "deg", + "one-shot", + ]: + """ + # temporially convert to deterministic model as moment model for mix_std_stm + and other types of labeling experiment is ongoing.""" - X_data[i_gene] = cur_X_data - if self.model.lower().startswith("mixture"): - X_fit_data[i_gene] = estm.simulator.x.T - X_fit_data[i_gene][estm.model1.n_species:] *= estm.scale - else: - # kinetic chase simulation - kinetic_chase = estm.simulator.x.T - # hidden x - tt, h = estm.simulator.calc_init_conc() + model = "deterministic" - X_fit_data[i_gene] = [kinetic_chase, [tt, h]] + if model_was_auto and experiment_type.lower() in [ + "kinetic", + "kin", + "degradation", + "deg", + ]: + model = "deterministic" - half_life[i_gene] = estm.calc_half_life("gamma") + if assumption_mRNA.lower() == "ss" or (experiment_type.lower() in ["one-shot", "mix_std_stm"]): + if est_method.lower() == "auto": + est_method = "gmm" if model.lower() == "stochastic" else "ols" - if self.model.lower().startswith("mixture"): - species = [0, 1, 2, 3] if self.has_splicing else [0, 1] - gof = GoodnessOfFit(estm.export_model(), params=estm.export_parameters()) - gof.prepare_data(time, cur_X_raw.T, species=species, normalize=True) + if experiment_type.lower() == "one-shot": + beta = subset_adata.var.beta if "beta" in subset_adata.var.keys() else None + gamma = subset_adata.var.gamma if "gamma" in subset_adata.var.keys() else None + ss_estimation_kwargs = {"beta": beta, "gamma": gamma} else: - gof = GoodnessOfFit( - estm.export_model(), - params=estm.export_parameters(), - x0=estm.simulator.x0, - ) - gof.prepare_data(time, cur_X_raw.T, normalize=True) - - logLL[i_gene] = gof.calc_mean_squared_deviation() # .calc_gaussian_loglikelihood() + ss_estimation_kwargs = {} - if self.est_method == "twostep": - if self.has_splicing: - layers = ( - ["M_u", "M_s"] if ("M_u" in subset_adata.layers.keys() and data_type == "smoothed") else ["X_u", - "X_s"] - ) - U, S = ( - subset_adata.layers[layers[0]].T, - subset_adata.layers[layers[1]].T, - ) - US, S2 = ( - subset_adata.layers["M_us"].T, - subset_adata.layers["M_ss"].T, - ) - # beta, beta_r2 = lin_reg_gamma_synthesis(U, 
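A note on the correction used by the velocity branches that follow: for a kinetics experiment handled under the steady-state assumption, the fitted slope k only gives the true degradation rate after the transform gamma = -ln(1 - k) / t, and the velocities are then rescaled by gamma / k (the `Kc` / `gamma_` lines below). A scalar sketch with made-up numbers:

    import numpy as np

    t = 2.0                                  # labeling duration (hours, say)
    k = 0.63                                 # fitted slope for one gene (new vs. total)
    k = float(np.clip(k, 0, 1 - 1e-3))       # mirror the clipping used below
    gamma = -np.log(1.0 - k) / t             # "actual gamma", from k = 1 - exp(-gamma * t)
    half_life = np.log(2) / gamma
    print(round(gamma, 3), round(half_life, 3))  # ~0.497 per hour, half-life ~1.394 hours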
Ul, time, perc_right=100) - ( - gamma_k, - gamma_b, - gamma_all_r2, - gamma_all_logLL, - ) = fit_slope_stochastic(S, U, US, S2, perc_left=None, perc_right=5) + est = ss_estimation( + U=U.copy() if U is not None else None, + Ul=Ul.copy() if Ul is not None else None, + S=S.copy() if S is not None else None, + Sl=Sl.copy() if Sl is not None else None, + P=P.copy() if P is not None else None, + US=US.copy() if US is not None else None, + S2=S2.copy() if S2 is not None else None, + conn=subset_adata.obsp["moments_con"], + t=t, + ind_for_proteins=ind_for_proteins, + model=model, + est_method=est_method, + experiment_type=experiment_type, + assumption_mRNA=assumption_mRNA, + assumption_protein=assumption_protein, + concat_data=concat_data, + cores=cores, + **ss_estimation_kwargs, + ) # U: (unlabeled) unspliced; S: (unlabeled) spliced; U / Ul: old and labeled; U, Ul, S, Sl: uu/ul/su/sl - Estm_df = pd.DataFrame(np.vstack(Estm), columns=[*all_keys[: len(Estm[0])]]) - Estm_df["gamma_k"] = gamma_k # gamma_k = gamma / beta - Estm_df["beta"] = Estm_df["gamma"] / gamma_k # gamma_k = gamma / beta - Estm_df["gamma_r2"] = gamma_all_r2 - else: - Estm_df = pd.DataFrame(np.vstack(Estm), columns=[*all_keys[: len(Estm[0])]]) - Estm_df["gamma_k"] = Estm_df["gamma"] # fix a bug in pl.dynamics - else: - Estm_df = pd.DataFrame(np.vstack(Estm), columns=[*all_keys[: len(Estm[0])]]) + with warnings.catch_warnings(): + warnings.simplefilter("ignore") - return Estm_df, half_life, cost, logLL, _param_ranges, X_data, X_fit_data + if experiment_type.lower() in ["one-shot", "one_shot"]: + est.fit(one_shot_method=one_shot_method, **est_kwargs) + else: + # experiment_type can be `kin` also and by default use + # conventional method to estimate k but correct for time + est.fit(**est_kwargs) - def calculate_vel_U( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], - ) -> Union[ndarray, csr_matrix]: - return vel.vel_u(U, repeat=True) + alpha, beta, gamma, eta, delta = est.parameters.values() - def calculate_vel_S( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], - ) -> Union[ndarray, csr_matrix]: - return vel.vel_s(U, S) + U, S = get_U_S_for_velocity_estimation( + subset_adata, + use_smoothed, + has_splicing, + has_labeling, + log_unnormalized, + NTR_vel, + ) + vel = Velocity(estimation=est) - def calculate_vel_N( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], - ) -> Union[ndarray, csr_matrix]: - return vel.vel_u(N, repeat=True) + if experiment_type.lower() in [ + "one_shot", + "one-shot", + "kin", + "mix_std_stm", + ]: + U_, S_ = get_U_S_for_velocity_estimation( + subset_adata, + use_smoothed, + has_splicing, + has_labeling, + log_unnormalized, + not NTR_vel, + ) - def calculate_vel_T( - self, - vel: Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], - ) -> Union[ndarray, csr_matrix]: - return vel.vel_u(T) if not self.has_splicing and self.NTR_vel else vel.vel_u(T, repeat=True) + # also get vel_N and vel_T + if NTR_vel: + if has_splicing: + if experiment_type == "kin": + Kc = np.clip(gamma[:, None], 0, 1 - 1e-3) # S - U slope + gamma_ = -(np.log(1 - Kc) / t[None, :]) # actual gamma - def calculate_vels( - self, - vel: 
Velocity, - U: Union[ndarray, csr_matrix], - S: Union[ndarray, csr_matrix], - N: Union[ndarray, csr_matrix], - T: Union[ndarray, csr_matrix], - ) -> Tuple: - """Override the velocity calculation function to reset beta when the data contains splicing information.""" - if self.has_splicing: - vel_U = self.calculate_vel_U(vel=vel, U=U, S=S, N=N, T=T) - vel_S = self.calculate_vel_S(vel=vel, U=U, S=S, N=N, T=T) - vel.parameters["beta"] = self.gamma - else: - vel_U, vel_S = np.nan, np.nan - vel_N = self.calculate_vel_N(vel=vel, U=U, S=S, N=N, T=T) - vel_T = self.calculate_vel_T(vel=vel, U=U, S=S, N=N, T=T) - return vel_U, vel_S, vel_N, vel_T + vel_U = U.multiply(csr_matrix(gamma_ / Kc)) - csr_matrix(beta).multiply(U_) # vel.vel_s(U_) + vel_S = vel.vel_s(U_, S_) + vel_N = (U - csr_matrix(Kc).multiply(U)).multiply(csr_matrix(gamma_ / Kc)) # vel.vel_u(U) + # scale back to true velocity via multiplying "gamma_ / Kc". + vel_T = (U - csr_matrix(Kc).multiply(S)).multiply(csr_matrix(gamma_ / Kc)) + elif experiment_type == "mix_std_stm": + # steady state RNA: u0, stimulation RNA: u_new; + # cell-wise transcription rate under simulation: alpha1 + u0, u_new, alpha1 = solve_alpha_2p_mat( + t0=np.max(t) - t, + t1=t, + alpha0=alpha[0], + beta=beta, + u1=U, + ) + vel_U = alpha1 - csr_matrix(beta[:, None]).multiply(U_) + vel_S = vel.vel_s(U_, S_) -# TODO: rename this later -def dynamics_wrapper( - adata: AnnData, - filter_gene_mode: Literal["final", "basic", "no"] = "final", - use_smoothed: bool = True, - assumption_mRNA: Literal["ss", "kinetic", "auto"] = "auto", - assumption_protein: Literal["ss"] = "ss", - model: Literal["auto", "deterministic", "stochastic"] = "auto", - est_method: Literal["ols", "rlm", "ransac", "gmm", "negbin", "auto", "twostep", "direct"] = "auto", - NTR_vel: bool = False, - group: Optional[str] = None, - protein_names: Optional[List[str]] = None, - concat_data: bool = False, - log_unnormalized: bool = True, - one_shot_method: Literal["combined", "sci-fate", "sci_fate"] = "combined", - fraction_for_deg: bool = False, - re_smooth: bool = False, - sanity_check: bool = False, - del_2nd_moments: Optional[bool] = None, - cores: int = 1, - tkey: str = None, - **est_kwargs, -) -> AnnData: - """Predict the model and assumption if they are set as auto. Run corresponding Dynamics methods according to the - experiment type. More information can be found in the class BaseDynamics.""" - if "pp" not in adata.uns_keys(): - raise ValueError(f"\nPlease run `dyn.pp.receipe_monocle(adata)` before running this function!") - if model.lower() == "auto": - model = "stochastic" - model_was_auto = True - else: - model = model - model_was_auto = False + vel_N = alpha1 - csr_matrix(gamma[:, None]).multiply(u_new) + vel_T = alpha1 - csr_matrix(beta[:, None]).multiply(S) + else: + vel_U = vel.vel_u(U_) + vel_S = vel.vel_s(U_, S_) + vel_N = vel.vel_u(U) + vel_T = vel.vel_s(U, S - U) # need to consider splicing + else: + if experiment_type == "kin": + vel_U = np.nan + vel_S = np.nan - (experiment_type, has_splicing, has_labeling, splicing_labeling, has_protein,) = ( - adata.uns["pp"]["experiment_type"], - adata.uns["pp"]["has_splicing"], - adata.uns["pp"]["has_labeling"], - adata.uns["pp"]["splicing_labeling"], - adata.uns["pp"]["has_protein"], - ) + Kc = np.clip(gamma[:, None], 0, 1 - 1e-3) # S - U slope + gamma_ = -(np.log(1 - Kc) / t[None, :]) # actual gamma + vel_N = (U - csr_matrix(Kc).multiply(U)).multiply(csr_matrix(gamma_ / Kc)) # vel.vel_u(U) + # scale back to true velocity via multiplying "gamma_ / Kc". 
+ vel_T = (U - csr_matrix(Kc).multiply(S)).multiply(csr_matrix(gamma_ / Kc)) + elif experiment_type == "mix_std_stm": + vel_U = np.nan + vel_S = np.nan - (NTR_vel, assump_mRNA) = get_auto_assump_mRNA( - subset_adata=adata, - has_splicing=has_splicing, - has_labeling=has_labeling, - use_moments=use_smoothed, - tkey=tkey, - NTR_vel=NTR_vel, - ) - if assumption_mRNA.lower() == "auto": - assumption_mRNA = assump_mRNA - if experiment_type.lower() == "conventional": - assumption_mRNA = "ss" - elif experiment_type.lower() in ["mix_pulse_chase", "deg", "kin"]: - assumption_mRNA = "kinetic" + # steady state RNA: u0, stimulation RNA: u_new; + # cell-wise transcription rate under simulation: alpha1 + u0, u_new, alpha1 = solve_alpha_2p_mat( + t0=np.max(t) - t, + t1=t, + alpha0=alpha[0], + beta=gamma, + u1=U, + ) - if model.lower() == "stochastic" and experiment_type.lower() not in [ - "conventional", - "kinetics", - "degradation", - "kin", - "deg", - "one-shot", - ]: - """ - # temporially convert to deterministic model as moment model for mix_std_stm - and other types of labeling experiment is ongoing.""" + vel_N = alpha1 - csr_matrix(gamma[:, None]).multiply(u_new) + vel_T = alpha1 - csr_matrix(gamma[:, None]).multiply(S) + else: + vel_U = np.nan + vel_S = np.nan + vel_N = vel.vel_u(U) + vel_T = vel.vel_u(S) # don't consider splicing + else: + if has_splicing: + if experiment_type == "kin": + Kc = np.clip(gamma[:, None], 0, 1 - 1e-3) # S - U slope + gamma_ = -(np.log(1 - Kc) / t[None, :]) # actual gamma - model = "deterministic" + vel_U = U_.multiply(csr_matrix(gamma_ / Kc) - csr_matrix(beta).multiply(U)) # vel.vel_u(U) + vel_S = vel.vel_s(U, S) - if model_was_auto and experiment_type.lower() in [ - "kinetic", - "kin", - "degradation", - "deg", - ]: - model = "deterministic" + vel_N = (U_ - csr_matrix(Kc).multiply(U_)).multiply( + csr_matrix(gamma_ / Kc) + ) # vel.vel_u(U_) + # scale back to true velocity via multiplying "gamma_ / Kc". 
+ vel_T = (U_ - csr_matrix(Kc).multiply(S_)).multiply(csr_matrix(gamma_ / Kc)) + elif experiment_type == "mix_std_stm": + # steady state RNA: u0, stimulation RNA: u_new; + # cell-wise transcription rate under simulation: alpha1 + u0, u_new, alpha1 = solve_alpha_2p_mat( + t0=np.max(t) - t, + t1=t, + alpha0=alpha[0], + beta=beta, + u1=U_, + ) - dynamics_kwargs = { - "adata": adata, - "filter_gene_mode": filter_gene_mode, - "use_smoothed": use_smoothed, - "assumption_mRNA": assumption_mRNA, - "assumption_protein": assumption_protein, - "model": model, - "model_was_auto": model_was_auto, - "experiment_type": experiment_type, - "has_splicing": has_splicing, - "has_labeling": has_labeling, - "splicing_labeling": splicing_labeling, - "has_protein": has_protein, - "est_method": est_method, - "NTR_vel": NTR_vel, - "group": group, - "protein_names": protein_names, - "concat_data": concat_data, - "log_unnormalized": log_unnormalized, - "one_shot_method": one_shot_method, - "fraction_for_deg": fraction_for_deg, - "re_smooth": re_smooth, - "sanity_check": sanity_check, - "del_2nd_moments": del_2nd_moments, - "cores": cores, - "tkey": tkey, - "est_kwargs": est_kwargs, - } + vel_U = alpha1 - csr_matrix(beta[:, None]).multiply(U) + vel_S = vel.vel_s(U, S) - if experiment_type == "conventional": - estimator = SplicedDynamics(dynamics_kwargs) - elif experiment_type in ["one-shot", "one_shot"]: - estimator = OneShotDynamics(dynamics_kwargs) - elif experiment_type == "kin": - if assumption_mRNA == "ss": - estimator = SSKineticsDynamics(dynamics_kwargs) - elif assumption_mRNA == "kinetic": - if model == 'deterministic': - estimator = KineticsDynamics(dynamics_kwargs) - elif model == 'stochastic': - estimator = KineticsStormDynamics(dynamics_kwargs) - else: - raise NotImplementedError("This method has not been implemented.") - elif experiment_type == "deg": - estimator = DegradationDynamics(dynamics_kwargs) - elif experiment_type == "mix_std_stm": - estimator = MixStdStmDynamics(dynamics_kwargs) - elif experiment_type in ["mix_kin_deg", "mix_pulse_chase"]: - estimator = MixKineticsDynamics(dynamics_kwargs) - else: - raise NotImplementedError("This method has not been implemented.") - adata = estimator.estimate() - return adata + vel_N = alpha1 - csr_matrix(gamma[:, None]).multiply(u_new) + vel_T = alpha1 - csr_matrix(beta[:, None]).multiply(S_) + else: + vel_U = vel.vel_u(U) + vel_S = vel.vel_s(U, S) + vel_N = vel.vel_u(U_) + vel_T = vel.vel_s(U_, S_ - U_) # need to consider splicing + else: + if experiment_type == "kin": + vel_U = np.nan + vel_S = np.nan -# incorporate the model selection code soon -def dynamics( - adata: AnnData, - filter_gene_mode: Literal["final", "basic", "no"] = "final", - use_smoothed: bool = True, - assumption_mRNA: Literal["ss", "kinetic", "auto"] = "auto", - assumption_protein: Literal["ss"] = "ss", - model: Literal["auto", "deterministic", "stochastic"] = "auto", - est_method: Literal["ols", "rlm", "ransac", "gmm", "negbin", "auto", "twostep", "direct"] = "auto", - NTR_vel: bool = False, - group: Optional[str] = None, - protein_names: Optional[List[str]] = None, - concat_data: bool = False, - log_unnormalized: bool = True, - one_shot_method: Literal["combined", "sci-fate", "sci_fate"] = "combined", - fraction_for_deg: bool = False, - re_smooth: bool = False, - sanity_check: bool = False, - del_2nd_moments: Optional[bool] = None, - cores: int = 1, - tkey: str = None, - **est_kwargs, -) -> AnnData: - """Inclusive model of expression dynamics considers splicing, metabolic labeling and 
protein translation. + Kc = np.clip(gamma[:, None], 0, 1 - 1e-3) # S - U slope + gamma_ = -(np.log(1 - Kc) / t[None, :]) # actual gamma + vel_N = (U_ - csr_matrix(Kc).multiply(U_)).multiply( + csr_matrix(gamma_ / Kc) + ) # vel.vel_u(U_) + # scale back to true velocity via multiplying "gamma_ / Kc". + vel_T = (U_ - csr_matrix(Kc).multiply(S_)).multiply(csr_matrix(gamma_ / Kc)) + elif experiment_type == "mix_std_stm": + vel_U = np.nan + vel_S = np.nan - The function supports learning high-dimensional velocity vector samples for droplet based (10x, inDrop, drop-seq, - etc), scSLAM-seq, NASC-seq sci-fate, scNT-seq, scEU-seq, cite-seq or REAP-seq datasets. + # steady state RNA: u0, stimulation RNA: u_new; + # cell-wise transcription rate under simulation: alpha1 + u0, u_new, alpha1 = solve_alpha_2p_mat( + t0=np.max(t) - t, + t1=t, + alpha0=alpha[0], + beta=gamma, + u1=U_, + ) - Args: - adata: an AnnData object. - filter_gene_mode: The string for indicating which mode of gene filter will be used. Defaults to "final". - use_smoothed: whether to use the smoothed data when estimating kinetic parameters and calculating velocity for - each gene. When you have time-series data (`tkey` is not None), we recommend to smooth data among cells from - each time point. Defaults to True. - assumption_mRNA: Parameter estimation assumption for mRNA. Available options are: - (1) 'ss': pseudo steady state; - (2) 'kinetic' or None: degradation and kinetic data without steady state assumption. - (3) 'auto': dynamo will choose a reasonable assumption of the system under study automatically. - If no labelling data exists, assumption_mRNA will automatically set to be 'ss'. For one-shot experiment, - assumption_mRNA is set to be None. However we will use steady state assumption to estimate parameters alpha - and gamma either by a deterministic linear regression or the first order decay approach in line of the - sci-fate paper; - Defaults to "auto". - assumption_protein: Parameter estimation assumption for protein. Available options are: - (1) 'ss': pseudo steady state; - Defaults to "ss". - model: String indicates which estimation model will be used. - Available options are: - (1) 'deterministic': The method based on `deterministic` ordinary differential equations; - (2) 'stochastic' or `moment`: The new method from us that is based on `stochastic` master equations; - Note that `kinetic` model doesn't need to assumes the `experiment_type` is not `conventional`. As other - labeling experiments, if you specify the `tkey`, dynamo can also apply `kinetic` model on `conventional` - scRNA-seq datasets. A "model_selection" model will be supported soon in which alpha, beta and gamma will be - modeled as a function of time. - Defaults to "auto". - est_method: This parameter should be used in conjunction with `model` parameter. - Available options when the `model` is 'ss' include: - (1) 'ols': The canonical method or Ordinary Least Squares regression from the seminar RNA velocity paper - based on deterministic ordinary differential equations; - (2) 'rlm': The robust linear models from statsmodels. Robust Regression provides an alternative to OLS - regression by lowering the restrictions on assumptions and dampens the effect of outliers in order - to fit majority of the data. - (3) 'ransac': RANSAC (RANdom SAmple Consensus) algorithm for robust linear regression. RANSAC is an - iterative algorithm for the robust estimation of parameters from a subset of inliers from the - complete dataset. 
RANSAC implementation is based on RANSACRegressor function from sklearn package. - Note that if `rlm` or `ransac` failed, it will roll back to the `ols` method. In addition, `ols`, - `rlm` and `ransac` can be only used in conjunction with the `deterministic` model. - (4) 'gmm': The new generalized methods of moments from us that is based on master equations, similar to - the "moment" model in the excellent scVelo package; - (5) 'negbin': The new method from us that models steady state RNA expression as a negative binomial - distribution, also built upon on master equations. - (6) 'auto': dynamo will choose the suitable estimation method based on the `assumption_mRNA`, - `experiment_type` and `model` parameter. - Note that all those methods require using extreme data points (except negbin, which use all data points) for - estimation. Extreme data points are defined as the data from cells whose expression of unspliced / spliced - or new / total RNA, etc. are in the top or bottom, 5%, for example. `linear_regression` only considers the - mean of RNA species (based on the `deterministic` ordinary different equations) while moment based methods - (`gmm`, `negbin`) considers both first moment (mean) and second moment (uncentered variance) of RNA species - (based on the `stochastic` master equations). - The above method are all (generalized) linear regression based method. In order to return estimated - parameters (including RNA half-life), it additionally returns R-squared (either just for extreme data points - or all data points) as well as the log-likelihood of the fitting, which will be used for transition matrix - and velocity embedding. - Available options when the `assumption_mRNA` is 'kinetic' include: - (1) 'auto': dynamo will choose the suitable estimation method based on the `assumption_mRNA`, - `experiment_type` and `model` parameter. - (2) `twostep`: first for each time point, estimate K (1-e^{-rt}) using the total and new RNA data. Then - use regression via t-np.log(1-K) to get degradation rate gamma. When splicing and labeling data both - exist, replacing new/total with ul/u can be used to estimate beta. Suitable for velocity estimation. - (3) `direct` (default): method that directly uses the kinetic model to estimate rate parameters, - generally not good for velocity estimation. - Under `kinetic` model, choosing estimation is `experiment_type` dependent. For `kinetics` experiments, - dynamo supposes methods including RNA bursting or without RNA bursting. Dynamo also adaptively estimates - parameters, based on whether the data has splicing or without splicing. - Under `kinetic` assumption, the above method uses non-linear least square fitting. In order to return - estimated parameters (including RNA half-life), it additionally returns the log-likelihood of the - fitting, which will be used for transition matrix and velocity embedding. - All `est_method` uses least square to estimate optimal parameters with latin cubic sampler for initial - sampling. Defaults to "auto". - NTR_vel: whether to use NTR (new/total ratio) velocity for labeling datasets. Defaults to False. - group: the column key/name that identifies the grouping information (for example, clusters that correspond to - different cell types) of cells. This will be used to calculate 1/2 st moments and covariance for each cells - in each group. It will also enable estimating group-specific (i.e cell-type specific) kinetic parameters. - Defaults to None. 
- protein_names: a list of gene names corresponds to the rows of the measured proteins in the `X_protein` of the - `obsm` attribute. The names have to be included in the adata.var.index. Defaults to None. - concat_data: whether to concatenate data before estimation. If your data is a list of matrices for each time - point, this need to be set as True. Defaults to False. - log_unnormalized: whether to log transform the unnormalized data. Defaults to True. - one_shot_method: The method that will be used for estimating kinetic parameters for one-shot experiment data. - (1) the "sci-fate" method directly solves gamma with the first-order decay model; - (2) the "combined" model uses the linear regression under steady state to estimate relative gamma, and then - calculate absolute gamma (degradation rate), beta (splicing rate) and cell-wise alpha (transcription - rate). Defaults to "combined". - fraction_for_deg: whether to use the fraction of labeled RNA instead of the raw labeled RNA to estimate the - degradation parameter. Defaults to False. - re_smooth: whether to re-smooth the adata and also recalculate 1/2 moments or covariance. Defaults to False. - sanity_check: whether to perform sanity-check before estimating kinetic parameters and velocity vectors, - currently only applicable to kinetic or degradation metabolic labeling based scRNA-seq data. The basic idea - is that for kinetic (degradation) experiment, the total labelled RNA for each gene should increase - (decrease) over time. If they don't satisfy this criteria, those genes will be ignored during the - estimation. Defaults to False. - del_2nd_moments: whether to remove second moments or covariances. Default it is `False` so this avoids - recalculating 2nd moments or covariance but it may take a lot memory when your dataset is big. Set this to - `True` when your data is huge (like > 25, 000 cells or so) to reducing the memory footprint. Defaults to - None. - cores: number of cores to run the estimation. If cores is set to be > 1, multiprocessing will be used to - parallel the parameter estimation. Currently only applicable cases when assumption_mRNA is `ss` or cases - when experiment_type is either "one-shot" or "mix_std_stm". Defaults to 1. - tkey: the column key for the labeling time of cells in .obs. Used for labeling based scRNA-seq data. If `tkey` - is None, then `adata.uns["pp"]["tkey"]` will be checked and used if exists. Defaults to None. - **est_kwargs: Other arguments passed to the fit method (steady state models) or estimation methods (kinetic - models). 
+ vel_N = alpha1 - csr_matrix(gamma[:, None]).multiply(u_new) + vel_T = alpha1 - csr_matrix(gamma[:, None]).multiply(S_) + else: + vel_U = np.nan + vel_S = np.nan + vel_N = vel.vel_u(U_) + vel_T = vel.vel_u(S_) # don't consider splicing + else: + vel_U = vel.vel_u(U) + vel_S = vel.vel_s(U, S) + vel_N, vel_T = np.nan, np.nan + + vel_P = vel.vel_p(S, P) + + adata = set_velocity( + adata, + vel_U, + vel_S, + vel_N, + vel_T, + vel_P, + _group, + cur_grp, + cur_cells_bools, + valid_bools_, + ind_for_proteins, + ) + + adata = set_param_ss( + adata, + est, + alpha, + beta, + gamma, + eta, + delta, + experiment_type, + _group, + cur_grp, + kin_param_pre, + valid_bools_, + ind_for_proteins, + ) + + elif assumption_mRNA.lower() == "kinetic": + return_ntr = True if fraction_for_deg and experiment_type.lower() == "deg" else False + + if model_was_auto and experiment_type.lower() == "kin": + model = "mixture" + if est_method == "auto": + est_method = "direct" + data_type = "smoothed" if use_smoothed else "sfs" + + (params, half_life, cost, logLL, param_ranges, cur_X_data, cur_X_fit_data,) = kinetic_model( + subset_adata, + tkey, + model, + est_method, + experiment_type, + has_splicing, + splicing_labeling, + has_switch=True, + param_rngs={}, + data_type=data_type, + return_ntr=return_ntr, + **est_kwargs, + ) + + if type(params) == dict: + alpha = params.pop("alpha") + params = pd.DataFrame(params) + else: + alpha = params.loc[:, "alpha"].values if "alpha" in params.columns else None + + len_t, len_g = len(np.unique(t)), len(_group) + if cur_grp == _group[0]: + if len_g != 1: + # X_data, X_fit_data = np.zeros((len_g, adata.n_vars, len_t)), np.zeros((len_g, adata.n_vars,len_t)) + X_data, X_fit_data = [None] * len_g, [None] * len_g + + if len(_group) == 1: + X_data, X_fit_data = cur_X_data, cur_X_fit_data + else: + # X_data[cur_grp_i, :, :], X_fit_data[cur_grp_i, :, :] = cur_X_data, cur_X_fit_data + X_data[cur_grp_i], X_fit_data[cur_grp_i] = ( + cur_X_data, + cur_X_fit_data, + ) + + a, b, alpha_a, alpha_i, beta, gamma = ( + params.loc[:, "a"].values if "a" in params.columns else None, + params.loc[:, "b"].values if "b" in params.columns else None, + params.loc[:, "alpha_a"].values if "alpha_a" in params.columns else None, + params.loc[:, "alpha_i"].values if "alpha_i" in params.columns else None, + params.loc[:, "beta"].values if "beta" in params.columns else None, + params.loc[:, "gamma"].values if "gamma" in params.columns else None, + ) + if alpha is None: + alpha = fbar(a, b, alpha_a, 0) if alpha_i is None else fbar(a, b, alpha_a, alpha_i) + all_kinetic_params = [ + "a", + "b", + "alpha_a", + "alpha_i", + "alpha", + "beta", + "gamma", + ] + + extra_params = params.loc[:, params.columns.difference(all_kinetic_params)] + # if alpha = None, set alpha to be U; N - gamma R + params = {"alpha": alpha, "beta": beta, "gamma": gamma, "t": t} + vel = Velocity(**params) + # Fix below: + U, S = get_U_S_for_velocity_estimation( + subset_adata, + use_smoothed, + has_splicing, + has_labeling, + log_unnormalized, + NTR_vel, + ) + + U_, S_ = get_U_S_for_velocity_estimation( + subset_adata, + use_smoothed, + has_splicing, + has_labeling, + log_unnormalized, + not NTR_vel, + ) + + # also get vel_N and vel_T + if NTR_vel: + if has_splicing: + if experiment_type == "kin": + vel_U = vel.vel_u(U_) + vel_S = vel.vel_s(U_, S_) + vel.parameters["beta"] = gamma + vel_N = vel.vel_u(U) + vel_T = vel.vel_u(S) # no need to consider splicing + elif experiment_type == "deg": + if splicing_labeling: + vel_U = np.nan + vel_S = 
vel.vel_s(U_, S_) + vel_N = np.nan + vel_T = np.nan + else: + vel_U = np.nan + vel_S = vel.vel_s(U_, S_) + vel_N = np.nan + vel_T = np.nan + elif experiment_type in ["mix_kin_deg", "mix_pulse_chase"]: + vel_U = vel.vel_u(U_, repeat=True) + vel_S = vel.vel_s(U_, S_) + vel.parameters["beta"] = gamma + vel_N = vel.vel_u(U, repeat=True) + vel_T = vel.vel_u(S, repeat=True) # no need to consider splicing + else: + if experiment_type == "kin": + vel_U = np.nan + vel_S = np.nan + + # calculate cell-wise alpha, if est_method is twostep, this can be skipped + alpha_ = one_shot_alpha_matrix(U, gamma, t) + + vel.parameters["alpha"] = alpha_ + + vel_N = vel.vel_u(U) + vel_T = vel.vel_u(S) # don't consider splicing + elif experiment_type == "deg": + vel_U = np.nan + vel_S = np.nan + vel_N = np.nan + vel_T = np.nan + elif experiment_type in ["mix_kin_deg", "mix_pulse_chase"]: + vel_U = np.nan + vel_S = np.nan + vel_N = vel.vel_u(U, repeat=True) + vel_T = vel.vel_u(S) # don't consider splicing + else: + if has_splicing: + if experiment_type == "kin": + vel_U = vel.vel_u(U) + vel_S = vel.vel_s(U, S) + vel.parameters["beta"] = gamma + vel_N = vel.vel_u(U_) + vel_T = vel.vel_u(S_) # no need to consider splicing + elif experiment_type == "deg": + if splicing_labeling: + vel_U = np.nan + vel_S = vel.vel_s(U, S) + vel_N = np.nan + vel_T = np.nan + else: + vel_U = np.nan + vel_S = vel.vel_s(U, S) + vel_N = np.nan + vel_T = np.nan + elif experiment_type in ["mix_kin_deg", "mix_pulse_chase"]: + vel_U = vel.vel_u(U, repeat=True) + vel_S = vel.vel_s(U, S) + vel.parameters["beta"] = gamma + vel_N = vel.vel_u(U_, repeat=True) + vel_T = vel.vel_u(S_, repeat=True) # no need to consider splicing + else: + if experiment_type == "kin": + vel_U = np.nan + vel_S = np.nan + + # calculate cell-wise alpha, if est_method is twostep, this can be skipped + alpha_ = one_shot_alpha_matrix(U_, gamma, t) + + vel.parameters["alpha"] = alpha_ + + vel_N = vel.vel_u(U_) + vel_T = vel.vel_u(S_) # need to consider splicing + elif experiment_type == "deg": + vel_U = np.nan + vel_S = np.nan + vel_N = np.nan + vel_T = np.nan + elif experiment_type in ["mix_kin_deg", "mix_pulse_chase"]: + vel_U = np.nan + vel_S = np.nan + vel_N = vel.vel_u(U_, repeat=True) + vel_T = vel.vel_u(S_, repeat=True) # don't consider splicing - Raises: - ValueError: preprocessing not performed. - Exception: No gene pass filter. - Exception: Too few valid genes. + vel_P = vel.vel_p(S, P) - Returns: - An updated AnnData object with estimated kinetic parameters, inferred velocity and estimation related - information included. The estimated kinetic parameters are currently appended to .obs (should move to .obsm with - the key `dynamics` later). Depends on the estimation method, experiment type and whether you applied estimation - for each groups via `group`, the number of returned parameters can be variable. 
For conventional scRNA-seq - (including cite-seq or other types of protein/RNA coassays) and somethings metabolic labeling data, the - parameters will at mostly include: - alpha: Transcription rate - beta: Splicing rate - gamma: Spliced RNA degradation rate - eta: Translation rate (only applicable to RNA/protein coassay) - delta: Protein degradation rate (only applicable to RNA/protein coassay) - alpha_b: intercept of alpha fit - beta_b: intercept of beta fit - gamma_b: intercept of gamma fit - eta_b: intercept of eta fit (only applicable to RNA/protein coassay) - delta_b: intercept of delta fit (only applicable to RNA/protein coassay) - alpha_r2: r-squared for goodness of fit of alpha estimation - beta_r2: r-squared for goodness of fit of beta estimation - gamma_r2: r-squared for goodness of fit of gamma estimation - eta_r2: r-squared for goodness of fit of eta estimation (only applicable to RNA/protein coassay) - delta_r2: r-squared for goodness of fit of delta estimation (only applicable to RNA/protein coassay) - alpha_logLL: loglikelihood of alpha estimation (only applicable to stochastic model) - beta_loggLL: loglikelihood of beta estimation (only applicable to stochastic model) - gamma_logLL: loglikelihood of gamma estimation (only applicable to stochastic model) - eta_logLL: loglikelihood of eta estimation (only applicable to stochastic model and RNA/protein coassay) - delta_loggLL: loglikelihood of delta estimation (only applicable to stochastic model and RNA/protein - coassay) - uu0: estimated amount of unspliced unlabeled RNA at time 0 (only applicable to data with both splicing - and labeling) - ul0: estimated amount of unspliced labeled RNA at time 0 (only applicable to data with both splicing - and labeling) - su0: estimated amount of spliced unlabeled RNA at time 0 (only applicable to data with both splicing - and labeling) - sl0: estimated amount of spliced labeled RNA at time 0 (only applicable to data with both splicing and - labeling) - U0: estimated amount of unspliced RNA (uu + ul) at time 0 - S0: estimated amount of spliced (su + sl) RNA at time 0 - total0: estimated amount of spliced (U + S) RNA at time 0 - half_life: Spliced mRNA's half-life (log(2) / gamma) + adata = set_velocity( + adata, + vel_U, + vel_S, + vel_N, + vel_T, + vel_P, + _group, + cur_grp, + cur_cells_bools, + valid_bools_, + ind_for_proteins, + ) - Note that all data points are used when estimating r2 although only extreme data points are used for - estimating r2. This is applicable to all estimation methods, either `linear_regression`, `gmm` or `negbin`. - By default we set the intercept to be 0. 
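For the `one_shot_alpha_matrix` step used in the kinetic branch above, the underlying relation is presumably the same one the two-step estimator relies on further below: labeled RNA accumulates as n(t) = (alpha / gamma) * (1 - exp(-gamma * t)), so the cell-wise transcription rate is alpha = gamma * n / (1 - exp(-gamma * t)). A small sketch with toy numbers:

    import numpy as np

    gamma = 0.5                              # degradation rate of one gene (1/hour)
    t = np.array([0.5, 1.0, 2.0])            # labeling time of three cells
    new = np.array([1.8, 3.2, 5.1])          # labeled counts of that gene in those cells
    k = 1.0 - np.exp(-gamma * t)             # the same k = 1 - exp(-gamma * t) used throughout this patch
    alpha_cell = gamma * new / k             # cell-wise transcription rate
    print(np.round(alpha_cell, 2))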
+ adata = set_param_kinetic( + adata, + alpha, + a, + b, + alpha_a, + alpha_i, + beta, + gamma, + cost, + logLL, + kin_param_pre, + extra_params, + _group, + cur_grp, + cur_cells_bools, + valid_bools_, + ) + # add protein related parameters in the moment model below: + elif model.lower() == "model_selection": + main_warning("Not implemented yet.") - For metabolic labeling data, the kinetic parameters will at most include: - alpha: Transcription rate (effective - when RNA promoter switching considered) - beta: Splicing rate - gamma: Spliced RNA degradation rate - a: Switching rate from active promoter state to inactive promoter state - b: Switching rate from inactive promoter state to active promoter state - alpha_a: Transcription rate for active promoter - alpha_i: Transcription rate for inactive promoter - cost: cost of the kinetic parameters estimation - logLL: loglikelihood of kinetic parameters estimation - alpha_r2: r-squared for goodness of fit of alpha estimation - beta_r2: r-squared for goodness of fit of beta estimation - gamma_r2: r-squared for goodness of fit of gamma estimation - uu0: estimated amount of unspliced unlabeled RNA at time 0 (only applicable to data with both splicing - and labeling) - ul0: estimated amount of unspliced labeled RNA at time 0 (only applicable to data with both splicing - and labeling) - su0: estimated amount of spliced unlabeled RNA at time 0 (only applicable to data with both splicing - and labeling) - sl0: estimated amount of spliced labeled RNA at time 0 (only applicable to data with both splicing and - labeling) - u0: estimated amount of unspliced RNA (including uu, ul) at time 0 - s0: estimated amount of spliced (including su, sl) RNA at time 0 - total0: estimated amount of spliced (including U, S) RNA at time 0 - p_half_life: half-life for unspliced mRNA - half_life: half-life for spliced mRNA + if group is not None and group in adata.obs[group]: + uns_key = group + "_dynamics" + else: + uns_key = "dynamics" + + if sanity_check and experiment_type in ["kin", "deg"]: + sanity_check_cols = adata.var.columns.str.endswith("sanity_check") + adata.var["use_for_dynamics"] = adata.var.loc[:, sanity_check_cols].sum(1).astype(bool) + else: + adata.var["use_for_dynamics"] = False + adata.var.loc[valid_bools, "use_for_dynamics"] = True + + adata.uns[uns_key] = { + "filter_gene_mode": filter_gene_mode, + "t": t, + "group": group, + "X_data": X_data, + "X_fit_data": X_fit_data, + "asspt_mRNA": assumption_mRNA, + "experiment_type": experiment_type, + "normalized": normalized, + "model": model, + "est_method": est_method, + "has_splicing": has_splicing, + "has_labeling": has_labeling, + "splicing_labeling": splicing_labeling, + "has_protein": has_protein, + "use_smoothed": use_smoothed, + "NTR_vel": NTR_vel, + "log_unnormalized": log_unnormalized, + "fraction_for_deg": fraction_for_deg, + } + + if del_2nd_moments: + remove_2nd_moments(adata) + + return adata + + +class KineticEstimation: + def __init__( + self, + subset_adata: AnnData, + tkey: str, + model: Literal["auto", "deterministic", "stochastic"], + est_method: Literal["twostep", "direct", "storm-csp", "storm-cszip", "storm-icsp"], + experiment_type: str, + has_splicing: bool, + splicing_labeling: bool, + has_switch: bool, + param_rngs: Dict[str, List[int]], + data_type: Literal["smoothed", "sfs"] = "sfs", + return_ntr: bool = False, + **est_kwargs, + ): + self.subset_data = subset_adata + self.tkey = tkey + self.model = model + self.est_method = est_method + self.experiment_type = experiment_type + 
self.has_splicing = has_splicing + self.splicing_labeling = splicing_labeling + self.has_switch = has_switch + self.param_rngs = param_rngs + self.data_type = data_type + self.return_ntr = return_ntr + self.est_kwargs = est_kwargs + self.time = subset_adata.obs[tkey].astype("float").values + + def fit_twostep_kinetics(self): + if self.has_splicing: + layers = ( + ["M_u", "M_s", "M_t", "M_n"] + if ("M_u" in self.subset_adata.layers.keys() and self.data_type == "smoothed") + else ["X_u", "X_s", "X_t", "X_n"] + ) + U, S, Total, New = ( + self.subset_adata.layers[layers[0]].T, + self.subset_adata.layers[layers[1]].T, + self.subset_adata.layers[layers[2]].T, + self.subset_adata.layers[layers[3]].T, + ) + US, S2 = ( + self.subset_adata.layers["M_us"].T, + self.subset_adata.layers["M_ss"].T, + ) + # gamma, gamma_r2 = lin_reg_gamma_synthesis(U, Ul, time, perc_right=100) + ( + gamma_k, + gamma_b, + gamma_all_r2, + gamma_all_logLL, + ) = fit_slope_stochastic(S, U, US, S2, perc_left=None, perc_right=100) + ( + gamma, + gamma_r2, + X_data, + mean_R2, + K_fit, + ) = lin_reg_gamma_synthesis(Total, New, self.time, perc_right=100) + + k = 1 - np.exp(-gamma[:, None] * self.time[None, :]) + beta = gamma / gamma_k # gamma_k = gamma / beta + + Estm_df = { + "alpha": csr_matrix(gamma[:, None]).multiply(New).multiply(1 / k), + "beta": beta, + "gamma_k": gamma_k, + "gamma_b": gamma_b, + "gamma_k_r2": gamma_all_r2, + "gamma_logLL": gamma_all_logLL, + "gamma": gamma, + "gamma_r2": gamma_r2, + "mean_R2": mean_R2, + } + half_life = np.log(2) / gamma + cost, logLL, _param_ranges, X_data, X_fit_data = ( + None, + None, + None, + X_data, + K_fit, + ) + + return ( + Estm_df, + half_life, + cost, + logLL, + _param_ranges, + X_data, + X_fit_data, + ) + else: + layers = ( + ["M_t", "M_n"] + if ("M_t" in self.subset_adata.layers.keys() and self.data_type == "smoothed") + else ["X_t", "X_n"] + ) + Total, New = ( + self.subset_adata.layers[layers[0]].T, + self.subset_adata.layers[layers[1]].T, + ) + ( + gamma, + gamma_r2, + X_data, + mean_R2, + K_fit, + ) = lin_reg_gamma_synthesis(Total, New, self.time, perc_right=100) + + k = 1 - np.exp(-gamma[:, None] * self.time[None, :]) + Estm_df = { + "alpha": csr_matrix(gamma[:, None]).multiply(New).multiply(1 / k), + "gamma": gamma, + "gamma_k": gamma, # required for phase_potrait + "gamma_r2": gamma_r2, + "mean_R2": mean_R2, + } + half_life = np.log(2) / gamma + cost, logLL, _param_ranges, X_data, X_fit_data = ( + None, + None, + None, + X_data, + K_fit, + ) + + return ( + Estm_df, + half_life, + cost, + logLL, + _param_ranges, + X_data, + X_fit_data, + ) + def fit_storm(self): + if self.has_splicing: + # Initialization based on the steady-state assumption + layers_smoothed = ["M_u", "M_s", "M_t", "M_n"] + U_smoothed, S_smoothed, Total_smoothed, New_smoothed = ( + self.subset_adata.layers[layers_smoothed[0]].T, + self.subset_adata.layers[layers_smoothed[1]].T, + self.subset_adata.layers[layers_smoothed[2]].T, + self.subset_adata.layers[layers_smoothed[3]].T, + ) + + US_smoothed, S2_smoothed = ( + self.subset_adata.layers["M_us"].T, + self.subset_adata.layers["M_ss"].T, + ) + (gamma_k, _, _, _,) = fit_slope_stochastic(S_smoothed, U_smoothed, US_smoothed, S2_smoothed, + perc_left=None, perc_right=5) + (gamma_init, _, _, _, _) = lin_reg_gamma_synthesis(Total_smoothed, New_smoothed, self.time, perc_right=5) + beta_init = gamma_init / gamma_k # gamma_k = gamma / beta - If sanity_check has performed, a column with key `sanity_check` will also included which indicates which - gene passes 
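The two-step gamma estimate used by `fit_twostep_kinetics` above (and, as an initial value, by this storm branch) can be restated in a few lines of numpy: per labeling time, compute the mean new/total fraction K, then regress -log(1 - K) on t through the origin. The helper below is illustrative only and is not the `lin_reg_gamma_synthesis` implementation:

    import numpy as np

    def twostep_gamma(new, total, t):
        # new, total: labeled / total counts of one gene across cells; t: labeling times
        t_uniq = np.unique(t)
        K = np.array([new[t == ti].mean() / total[t == ti].mean() for ti in t_uniq])
        K = np.clip(K, 0, 1 - 1e-3)                  # keep log(1 - K) finite
        y = -np.log(1.0 - K)                         # model: y = gamma * t, through the origin
        gamma = (t_uniq @ y) / (t_uniq @ t_uniq)     # closed-form least-squares slope, no intercept
        return gamma, np.log(2) / gamma              # gamma and the implied half-life

    rng = np.random.default_rng(0)
    t = np.repeat([0.5, 1.0, 2.0], 200)              # toy labeling times (hours)
    total = rng.poisson(20.0, size=t.size) + 1.0     # toy total counts of one gene
    new = total * (1.0 - np.exp(-0.5 * t))           # true gamma = 0.5 / hour
    print(twostep_gamma(new, total, t))              # ~ (0.5, 1.386)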
filter (`filter_gene_mode`) and sanity check. This is only applicable to kinetic and degradation - metabolic labeling experiments. + # Read raw counts + layers_raw = ["ul", "sl"] + UL_raw, SL_raw = ( + self.subset_adata.layers[layers_raw[0]].T, + self.subset_adata.layers[layers_raw[1]].T, + ) - In addition, the `dynamics` key of the .uns attribute corresponds to a dictionary that includes the - following keys: - t: An array like object that indicates the time point of each cell used during parameters estimation - (applicable only to kinetic models) - group: The group that you used to estimate parameters group-wise - X_data: The input that was used for estimating parameters (applicable only to kinetic models) - X_fit_data: The data that was fitted during parameters estimation (applicable only to kinetic models) - asspt_mRNA: Assumption of mRNA dynamics (steady state or kinetic) - experiment_type: Experiment type (either conventional or metabolic labeling based) - normalized: Whether to normalize data - model: Model used for the parameter estimation (either auto, deterministic or stochastic) - has_splicing: Does the adata has splicing? detected automatically - has_labeling: Does the adata has labelling? detected automatically - has_protein: Does the adata has protein information? detected automatically - use_smoothed: Whether to use smoothed data (or first moment, done via local average of neighbor cells) - NTR_vel: Whether to estimate NTR velocity - log_unnormalized: Whether to log transform unnormalized data. - """ + # Read smoothed values based CSP type distribution for cell-specific parameter inference + UL_smoothed_CSP, SL_smoothed_CSP = ( + self.subset_adata.layers['M_CSP_ul'].T, + self.subset_adata.layers['M_CSP_sl'].T, + ) - del_2nd_moments = DynamoAdataConfig.use_default_var_if_none( - del_2nd_moments, DynamoAdataConfig.DYNAMICS_DEL_2ND_MOMENTS_KEY - ) - if "pp" not in adata.uns_keys(): - raise ValueError(f"\nPlease run `dyn.pp.receipe_monocle(adata)` before running this function!") - if tkey is None: - tkey = adata.uns["pp"]["tkey"] - (experiment_type, has_splicing, has_labeling, splicing_labeling, has_protein,) = ( - adata.uns["pp"]["experiment_type"], - adata.uns["pp"]["has_splicing"], - adata.uns["pp"]["has_labeling"], - adata.uns["pp"]["splicing_labeling"], - adata.uns["pp"]["has_protein"], - ) + # Parameters inference based on maximum likelihood estimation + cell_total = self.subset_adata.obs['initial_cell_size'].astype("float").values + # Independent cell-specific Poisson + (gamma_s, gamma_r2, beta, gamma_t, gamma_r2_raw, alpha) = storm.mle_independent_cell_specific_poisson \ + (UL_raw, SL_raw, self.time, gamma_init, beta_init, cell_total, Total_smoothed, S_smoothed) + gamma_k = gamma_s / beta + gamma_b = np.zeros_like(gamma_k) - X_data, X_fit_data = None, None - filter_list, filter_gene_mode_list = ( - [ - "use_for_pca", - "pass_basic_filter", - "no", - ], - ["final", "basic", "no"], - ) - filter_checker = [i in adata.var.columns for i in filter_list[:2]] - filter_checker.append(True) - filter_id = filter_gene_mode_list.index(filter_gene_mode) - which_filter = np.where(filter_checker[filter_id:])[0][0] + filter_id + # Cell specific parameters (fixed gamma_s) + alpha, beta = storm.cell_specific_alpha_beta(UL_smoothed_CSP, SL_smoothed_CSP, self.time, gamma_s, beta) - filter_gene_mode = filter_gene_mode_list[which_filter] + # # Cell specific parameters(fixed gamma_t) + # k = 1 - np.exp(-gamma_t[:, None] * time[None, :]) + # alpha = csr_matrix(gamma_t[:, 
None]).multiply(UL_smoothed_CSP+SL_smoothed_CSP).multiply(1 / k) - valid_bools = get_valid_bools(adata, filter_gene_mode) - gene_num = sum(valid_bools) - if gene_num == 0: - raise Exception(f"no genes pass filter. Try resetting `filter_gene_mode = 'no'` to use all genes.") + Estm_df = { + "alpha": alpha, + "beta": beta, + "gamma_k": gamma_k, + "gamma_b": gamma_b, + # "gamma_k_r2": gamma_all_r2, + # "gamma_logLL": gamma_all_logLL, + "gamma": gamma_s, + "gamma_r2": gamma_r2, + # "mean_R2": mean_R2, + "gamma_t": gamma_t, + "gamma_r2_raw": gamma_r2_raw, + } + half_life = np.log(2) / gamma_s + cost, logLL, _param_ranges, X_data, X_fit_data = ( + None, + None, + None, + None, + None, + ) - if model.lower() == "auto": - model = "stochastic" - model_was_auto = True - else: - model_was_auto = False + return ( + Estm_df, + half_life, + cost, + logLL, + _param_ranges, + X_data, + X_fit_data, + ) + else: + # Initialization based on the steady-state assumption + layers_smoothed = ["M_t", "M_n"] + Total_smoothed, New_smoothed = ( + self.subset_adata.layers[layers_smoothed[0]].T, + self.subset_adata.layers[layers_smoothed[1]].T, + ) + (gamma_init, _, _, _, _,) = lin_reg_gamma_synthesis(Total_smoothed, New_smoothed, time, + perc_right=5) - if tkey is not None: - if adata.obs[tkey].max() > 60: - main_warning( - "Looks like you are using minutes as the time unit. For the purpose of numeric stability, " - "we recommend using hour as the time unit." + # Read raw counts + layers_raw = ["total", "new"] + Total_raw, New_raw = ( + self.subset_adata.layers[layers_raw[0]].T, + self.subset_adata.layers[layers_raw[1]].T, ) - if model.lower() == "stochastic" or use_smoothed or re_smooth: - M_layers = [i for i in adata.layers.keys() if i.startswith("M_")] + # Read smoothed values based CSP type distribution for cell-specific parameter inference + layers_smoothed_CSP = ["M_CSP_t", "M_CSP_n"] + Total_smoothed_CSP, New_smoothed_CSP = ( + self.subset_adata.layers[layers_smoothed_CSP[0]].T, + self.subset_adata.layers[layers_smoothed_CSP[1]].T, + ) - if len(M_layers) < 2 or re_smooth: - main_info("removing existing M layers:%s..." % (str(list(M_layers))), indent_level=2) - for i in M_layers: - del adata.layers[i] - main_info("making adata smooth...", indent_level=2) + # Parameters inference based on maximum likelihood estimation + cell_total = self.subset_adata.obs['initial_cell_size'].astype("float").values - if group is not None and group in adata.obs.columns: - moments(adata, genes=valid_bools, group=group) + if "storm-csp" == self.est_method: + gamma, gamma_r2, gamma_r2_raw, alpha = storm.mle_cell_specific_poisson(New_raw, time, + gamma_init, cell_total) + elif "storm-cszip" == self.est_method: + gamma, prob_off, gamma_r2, gamma_r2_raw, alpha = storm.mle_cell_specific_zero_inflated_poisson( + New_raw, self.time, gamma_init, cell_total) + alpha = alpha * (1 - prob_off) # gene-wise alpha else: - moments(adata, genes=valid_bools, group=tkey) - elif tkey is not None: - main_warning( - f"You used tkey {tkey} (or group {group}), but you have calculated local smoothing (1st moment) " - f"for your data before. Please ensure you used the desired tkey or group when the smoothing was " - f"performed. Try setting re_smooth = True if not sure." 
+ raise NotImplementedError("This method has not been implemented.") + + k = 1 - np.exp(-gamma[:, None] * self.time[None, :]) + alpha = csr_matrix(gamma[:, None]).multiply(New_smoothed_CSP).multiply(1 / k) # gene-cell-wise alpha + + Estm_df = { + "alpha": alpha, + "gamma": gamma, + "gamma_k": gamma, # required for phase_potrait + "gamma_r2": gamma_r2, + "gamma_r2_raw": gamma_r2_raw, + # "mean_R2": mean_R2, + "prob_off": prob_off if "cszip" in self.est_method else None + } + half_life = np.log(2) / gamma + cost, logLL, _param_ranges, X_data, X_fit_data = ( + None, + None, + None, + None, # X_data, + None, # K_fit, ) - valid_adata = adata[:, valid_bools].copy() - if group is not None and group in adata.obs.columns: - _group = adata.obs[group].unique() - if any(adata.obs[group].value_counts() < 50): - main_warning( - f"Note that some groups have less than 50 cells, this may lead to the velocities for some " - f"cells are all NaN values and cause issues for all downstream analysis. Please try to " - f"coarse-grain cell groupings. Cell number for each group are {adata.obs[group].value_counts()}" + return ( + Estm_df, + half_life, + cost, + logLL, + _param_ranges, + X_data, + X_fit_data, ) - else: - _group = ["_all_cells"] + def fit_direct_kinetics(self): + if self.has_splicing and self.splicing_labeling: + layers = ( + ["M_ul", "M_sl", "M_uu", "M_su"] + if ("M_ul" in self.subset_adata.layers.keys() and self.data_type == "smoothed") + else ["X_ul", "X_sl", "X_uu", "X_su"] + ) - for cur_grp_i, cur_grp in enumerate(_group): - if cur_grp == "_all_cells": - kin_param_pre = "" - cur_cells_bools = np.ones(valid_adata.shape[0], dtype=bool) - subset_adata = valid_adata[cur_cells_bools] - else: - kin_param_pre = str(group) + "_" + str(cur_grp) + "_" - cur_cells_bools = (valid_adata.obs[group] == cur_grp).values - subset_adata = valid_adata[cur_cells_bools] + if self.model.lower() in ["deterministic", "stochastic"]: + layer_u = "M_ul" if ("M_ul" in self.subset_adata.layers.keys() and self.data_type == "smoothed") else "X_ul" + layer_s = "M_sl" if ("M_ul" in self.subset_adata.layers.keys() and self.data_type == "smoothed") else "X_sl" - if model.lower() == "stochastic" or use_smoothed: - moments(subset_adata) - ( - U, - Ul, - S, - Sl, - P, - US, - U2, - S2, - t, - normalized, - ind_for_proteins, - assump_mRNA, - ) = get_data_for_kin_params_estimation( - subset_adata, - has_splicing, - has_labeling, - model, - use_smoothed, - tkey, - protein_names, - log_unnormalized, - NTR_vel, - ) + X, X_raw = prepare_data_has_splicing( + self.subset_adata, + self.subset_adata.var.index, + self.time, + layer_u=layer_u, + layer_s=layer_s, + total_layers=layers, + ) + elif self.model.startswith("mixture"): + X, _, X_raw = prepare_data_deterministic( + self.subset_adata, + self.subset_adata.var.index, + self.time, + layers=layers, + total_layers=layers, + ) + + if self.model.lower() == "deterministic": + X = [X[i][[0, 1], :] for i in range(len(X))] + _param_ranges = { + "alpha": [0, 1000], + "beta": [0, 1000], + "gamma": [0, 1000], + } + x0 = {"u0": [0, 1000], "s0": [0, 1000]} + Est, _ = Estimation_DeterministicKin, Deterministic + elif self.model.lower() == "stochastic": + x0 = { + "u0": [0, 1000], + "s0": [0, 1000], + "uu0": [0, 1000], + "ss0": [0, 1000], + "us0": [0, 1000], + } - valid_bools_ = valid_bools.copy() - if sanity_check and experiment_type.lower() in ["kin", "deg"]: - indices_valid_bools = np.where(valid_bools)[0] - t, L = ( - t.flatten(), - (0 if Ul is None else Ul) + (0 if Sl is None else Sl), - ) - t_uniq = 
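For the storm branches above, the underlying idea is a maximum-likelihood fit of a per-cell Poisson model on the raw labeled counts, with the cell size acting as the exposure. The toy below shows one plausible form of that likelihood for a single gene; it only illustrates the approach, and the exact parameterization inside dynamo's storm module (and its zero-inflated variant) may differ.

import numpy as np
from scipy.optimize import minimize
from scipy.special import gammaln

def mle_cell_specific_poisson_toy(new_raw, t, cell_total, gamma_init):
    """new_raw: raw labeled counts; t: labeling times; cell_total: per-cell size factors."""
    def nll(log_params):
        alpha, gamma = np.exp(log_params)
        # expected labeled counts per cell under constant synthesis and first-order decay
        mu = cell_total * (alpha / gamma) * (1 - np.exp(-gamma * t)) + 1e-12
        return -np.sum(new_raw * np.log(mu) - mu - gammaln(new_raw + 1))
    res = minimize(nll, x0=np.log([1.0, gamma_init]), method="Nelder-Mead")
    alpha_hat, gamma_hat = np.exp(res.x)
    return alpha_hat, gamma_hat

The gene-by-cell alpha written to Estm_df above is then recovered from the smoothed new counts as gamma * new / (1 - exp(-gamma * t)), the same k-correction used by the two-step route.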
np.unique(t) + if self.has_switch: + _param_ranges = { + "a": [0, 1000], + "b": [0, 1000], + "alpha_a": [0, 1000], + "alpha_i": 0, + "beta": [0, 1000], + "gamma": [0, 1000], + } + Est, _ = Estimation_MomentKin, Moments + else: + _param_ranges = { + "alpha": [0, 1000], + "beta": [0, 1000], + "gamma": [0, 1000], + } - valid_gene_checker = np.zeros(gene_num, dtype=bool) - for L_iter, cur_L in tqdm( - enumerate(L), - desc=f"sanity check of {experiment_type} experiment data:", - ): - cur_L = cur_L.A.flatten() if issparse(cur_L) else cur_L.flatten() - y = strat_mom(cur_L, t, np.nanmean) - slope, _ = fit_linreg(t_uniq, y, intercept=True, r2=False) - valid_gene_checker[L_iter] = ( - True - if (slope > 0 and experiment_type == "kin") or (slope < 0 and experiment_type == "deg") - else False + Est, _ = ( + Estimation_MomentKinNoSwitch, + Moments_NoSwitching, + ) + elif self.model.lower() == "mixture": + _param_ranges = { + "alpha": [0, 1000], + "alpha_2": [0, 0], + "beta": [0, 1000], + "gamma": [0, 1000], + } + x0 = { + "ul0": [0, 0], + "sl0": [0, 0], + "uu0": [0, 1000], + "su0": [0, 1000], + } + + Est = Mixture_KinDeg_NoSwitching(Deterministic(), Deterministic()) + elif self.model.lower() == "mixture_deterministic_stochastic": + X, X_raw = prepare_data_mix_has_splicing( + self.subset_adata, + self.subset_adata.var.index, + self.time, + layer_u=layers[2], + layer_s=layers[3], + layer_ul=layers[0], + layer_sl=layers[1], + total_layers=layers, + mix_model_indices=[0, 1, 5, 6, 7, 8, 9], ) - valid_bools_[indices_valid_bools[~valid_gene_checker]] = False - main_warning(f"filtering {gene_num - valid_gene_checker.sum()} genes after sanity check.") - if len(valid_bools_) < 5: - raise Exception( - f"After sanity check, you have less than 5 valid genes. Something is wrong about your " - f"metabolic labeling experiment!" + _param_ranges = { + "alpha": [0, 1000], + "alpha_2": [0, 0], + "beta": [0, 1000], + "gamma": [0, 1000], + } + x0 = { + "ul0": [0, 0], + "sl0": [0, 0], + "u0": [0, 1000], + "s0": [0, 1000], + "uu0": [0, 1000], + "ss0": [0, 1000], + "us0": [0, 1000], + } + Est = Mixture_KinDeg_NoSwitching(Deterministic(), Moments_NoSwitching()) + elif self.model.lower() == "mixture_stochastic_stochastic": + _param_ranges = { + "alpha": [0, 1000], + "alpha_2": [0, 0], + "beta": [0, 1000], + "gamma": [0, 1000], + } + X, X_raw = prepare_data_mix_has_splicing( + self.subset_adata, + self.subset_adata.var.index, + self.time, + layer_u=layers[2], + layer_s=layers[3], + layer_ul=layers[0], + layer_sl=layers[1], + total_layers=layers, + mix_model_indices=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9], + ) + x0 = { + "ul0": [0, 1000], + "sl0": [0, 1000], + "ul_ul0": [0, 1000], + "sl_sl0": [0, 1000], + "ul_sl0": [0, 1000], + "u0": [0, 1000], + "s0": [0, 1000], + "uu0": [0, 1000], + "ss0": [0, 1000], + "us0": [0, 1000], + } + Est = Mixture_KinDeg_NoSwitching(Moments_NoSwitching(), Moments_NoSwitching()) + else: + raise NotImplementedError( + f"model {self.model} with kinetic assumption is not implemented. 
" + f"current supported models for kinetics experiments include: stochastic, deterministic, mixture," + f"mixture_deterministic_stochastic or mixture_stochastic_stochastic" ) + else: + total_layer = "M_t" if ("M_t" in self.subset_adata.layers.keys() and self.data_type == "smoothed") else "X_total" - U, Ul, S, Sl = ( - (None if U is None else U[valid_gene_checker, :]), - (None if Ul is None else Ul[valid_gene_checker, :]), - (None if S is None else S[valid_gene_checker, :]), - (None if Sl is None else Sl[valid_gene_checker, :]), - ) - subset_adata = subset_adata[:, valid_gene_checker] - adata.var[kin_param_pre + "sanity_check"] = valid_bools_ + if self.model.lower() in ["deterministic", "stochastic"]: + layer = "M_n" if ("M_n" in self.subset_adata.layers.keys() and self.data_type == "smoothed") else "X_new" + X, X_raw = prepare_data_no_splicing( + self.subset_adata, + self.subset_adata.var.index, + self.time, + layer=layer, + total_layer=total_layer, + ) + elif self.model.lower().startswith("mixture"): + layers = ( + ["M_n", "M_t"] + if ("M_n" in self.subset_adata.layers.keys() and self.data_type == "smoothed") + else ["X_new", "X_total"] + ) - if assumption_mRNA.lower() == "auto": - assumption_mRNA = assump_mRNA - if experiment_type.lower() == "conventional": - assumption_mRNA = "ss" - elif experiment_type.lower() in ["mix_pulse_chase", "deg", "kin"]: - assumption_mRNA = "kinetic" + X, _, X_raw = prepare_data_deterministic( + self.subset_adata, + self.subset_adata.var.index, + self.time, + layers=layers, + total_layers=total_layer, + ) - if model.lower() == "stochastic" and experiment_type.lower() not in [ - "conventional", - "kinetics", - "degradation", - "kin", - "deg", - "one-shot", - ]: - """ - # temporially convert to deterministic model as moment model for mix_std_stm - and other types of labeling experiment is ongoing.""" + if self.model.lower() == "deterministic": + X = [X[i][0, :] for i in range(len(X))] + _param_ranges = { + "alpha": [0, 1000], + "gamma": [0, 1000], + } + x0 = {"u0": [0, 1000]} + Est, _ = ( + Estimation_DeterministicKinNosp, + Deterministic_NoSplicing, + ) + elif self.model.lower() == "stochastic": + x0 = { + "u0": [0, 1000], + "uu0": [0, 1000], + } + if self.has_switch: + _param_ranges = { + "a": [0, 1000], + "b": [0, 1000], + "alpha_a": [0, 1000], + "alpha_i": 0, + "gamma": [0, 1000], + } + Est, _ = Estimation_MomentKinNosp, Moments_Nosplicing + else: + _param_ranges = { + "alpha": [0, 1000], + "gamma": [0, 1000], + } + Est, _ = ( + Estimation_MomentKinNoSwitchNoSplicing, + Moments_NoSwitchingNoSplicing, + ) + elif self.model.lower() == "mixture": + _param_ranges = { + "alpha": [0, 1000], + "alpha_2": [0, 0], + "gamma": [0, 1000], + } + x0 = {"u0": [0, 0], "o0": [0, 1000]} + Est = Mixture_KinDeg_NoSwitching(Deterministic_NoSplicing(), Deterministic_NoSplicing()) + elif self.model.lower() == "mixture_deterministic_stochastic": + X, X_raw = prepare_data_mix_no_splicing( + self.subset_adata, + self.subset_adata.var.index, + self.time, + layer_n=layers[0], + layer_t=layers[1], + total_layer=total_layer, + mix_model_indices=[0, 2, 3], + ) - model = "deterministic" + _param_ranges = { + "alpha": [0, 1000], + "alpha_2": [0, 0], + "gamma": [0, 1000], + } + x0 = {"u0": [0, 1000], "o0": [0, 1000], "oo0": [0, 1000]} + Est = Mixture_KinDeg_NoSwitching( + Deterministic_NoSplicing(), + Moments_NoSwitchingNoSplicing(), + ) + elif self.model.lower() == "mixture_stochastic_stochastic": + X, X_raw = prepare_data_mix_no_splicing( + self.subset_adata, + 
self.subset_adata.var.index, + self.time, + layer_n=layers[0], + layer_t=layers[1], + total_layer=total_layer, + mix_model_indices=[0, 1, 2, 3], + ) - if model_was_auto and experiment_type.lower() in [ - "kinetic", - "kin", - "degradation", - "deg", - ]: - model = "deterministic" + _param_ranges = { + "alpha": [0, 1000], + "alpha_2": [0, 0], + "gamma": [0, 1000], + } + x0 = { + "u0": [0, 1000], + "uu0": [0, 1000], + "o0": [0, 1000], + "oo0": [0, 1000], + } + Est = Mixture_KinDeg_NoSwitching( + Moments_NoSwitchingNoSplicing(), + Moments_NoSwitchingNoSplicing(), + ) + else: + raise NotImplementedError( + f"model {self.model} with kinetic assumption is not implemented. " + f"current supported models for kinetics experiments include: stochastic, deterministic, " + f"mixture, mixture_deterministic_stochastic or mixture_stochastic_stochastic" + ) + _param_ranges = update_dict(_param_ranges, self.param_rngs) + x0_ = np.vstack([ran for ran in x0.values()]).T + + n_genes = self.subset_adata.n_vars + cost, logLL = np.zeros(n_genes), np.zeros(n_genes) + all_keys = list(_param_ranges.keys()) + list(x0.keys()) + all_keys = [cur_key for cur_key in all_keys if cur_key != "alpha_i"] + half_life, Estm = np.zeros(n_genes), [None] * n_genes + X_data, X_fit_data = [None] * n_genes, [None] * n_genes + if self.experiment_type: + popt = [None] * n_genes - if assumption_mRNA.lower() == "ss" or (experiment_type.lower() in ["one-shot", "mix_std_stm"]): - if est_method.lower() == "auto": - est_method = "gmm" if model.lower() == "stochastic" else "ols" + main_debug("model: %s, experiment_type: %s" % (self.model, self.experiment_type)) + for i_gene in tqdm(range(n_genes), desc="estimating kinetic-parameters using kinetic model"): + if self.model.lower().startswith("mixture"): + estm = Est + if self.model.lower() == "mixture": + cur_X_data = np.vstack([X[i_layer][i_gene] for i_layer in range(len(X))]) + if issparse(X_raw[0]): + cur_X_raw = np.hstack([X_raw[i_layer][:, i_gene].A for i_layer in range(len(X))]) + else: + cur_X_raw = np.hstack([X_raw[i_layer][:, i_gene] for i_layer in range(len(X))]) + else: + cur_X_data = X[i_gene] + cur_X_raw = X_raw[i_gene] - if experiment_type.lower() == "one-shot": - beta = subset_adata.var.beta if "beta" in subset_adata.var.keys() else None - gamma = subset_adata.var.gamma if "gamma" in subset_adata.var.keys() else None - ss_estimation_kwargs = {"beta": beta, "gamma": gamma} + if issparse(cur_X_raw[0, 0]): + cur_X_raw = np.hstack((cur_X_raw[0, 0].A, cur_X_raw[1, 0].A)) + + _, cost[i_gene] = estm.auto_fit(np.unique(self.time), cur_X_data) + ( + model_1, + model_2, + kinetic_parameters, + mix_x0, + ) = estm.export_dictionary().values() + tmp = list(kinetic_parameters.values()) + tmp.extend(mix_x0) + Estm[i_gene] = tmp else: - ss_estimation_kwargs = {} + cur_X_data, cur_X_raw = X[i_gene], X_raw[i_gene] - est = ss_estimation( - U=U.copy() if U is not None else None, - Ul=Ul.copy() if Ul is not None else None, - S=S.copy() if S is not None else None, - Sl=Sl.copy() if Sl is not None else None, - P=P.copy() if P is not None else None, - US=US.copy() if US is not None else None, - S2=S2.copy() if S2 is not None else None, - conn=subset_adata.obsp["moments_con"], - t=t, - ind_for_proteins=ind_for_proteins, - model=model, - est_method=est_method, - experiment_type=experiment_type, - assumption_mRNA=assumption_mRNA, - assumption_protein=assumption_protein, - concat_data=concat_data, - cores=cores, - **ss_estimation_kwargs, - ) # U: (unlabeled) unspliced; S: (unlabeled) spliced; U / Ul: old and 
labeled; U, Ul, S, Sl: uu/ul/su/sl + if self.has_splicing: + alpha0 = guestimate_alpha(np.sum(cur_X_data, 0), np.unique(self.time)) + else: + alpha0 = ( + guestimate_alpha(cur_X_data, np.unique(time)) + if cur_X_data.ndim == 1 + else guestimate_alpha(cur_X_data[0], np.unique(time)) + ) - with warnings.catch_warnings(): - warnings.simplefilter("ignore") + if self.model.lower() == "stochastic": + _param_ranges.update({"alpha_a": [0, alpha0 * 10]}) + elif self.model.lower() == "deterministic": + _param_ranges.update({"alpha": [0, alpha0 * 10]}) + param_ranges = [ran for ran in _param_ranges.values()] - if experiment_type.lower() in ["one-shot", "one_shot"]: - est.fit(one_shot_method=one_shot_method, **est_kwargs) + estm = Est(*param_ranges, x0=x0_) if "x0" in inspect.getfullargspec(Est) else Est(*param_ranges) + _, cost[i_gene] = estm.fit_lsq(np.unique(self.time), cur_X_data, **self.est_kwargs) + if self.model.lower() == "deterministic": + Estm[i_gene] = estm.export_parameters() else: - # experiment_type can be `kin` also and by default use - # conventional method to estimate k but correct for time - est.fit(**est_kwargs) - - alpha, beta, gamma, eta, delta = est.parameters.values() + tmp = np.ma.array(estm.export_parameters(), mask=False) + tmp.mask[3] = True + Estm[i_gene] = tmp.compressed() - U, S = get_U_S_for_velocity_estimation( - subset_adata, - use_smoothed, - has_splicing, - has_labeling, - log_unnormalized, - NTR_vel, - ) - vel = Velocity(estimation=est) + if issparse(cur_X_raw[0, 0]): + cur_X_raw = np.hstack((cur_X_raw[0, 0].A, cur_X_raw[1, 0].A)) - if experiment_type.lower() in [ - "one_shot", - "one-shot", - "kin", - "mix_std_stm", - ]: - U_, S_ = get_U_S_for_velocity_estimation( - subset_adata, - use_smoothed, - has_splicing, - has_labeling, - log_unnormalized, - not NTR_vel, - ) + X_data[i_gene] = cur_X_data + if self.model.lower().startswith("mixture"): + X_fit_data[i_gene] = estm.simulator.x.T + X_fit_data[i_gene][estm.model1.n_species:] *= estm.scale + else: + if hasattr(estm, "extract_data_from_simulator"): + X_fit_data[i_gene] = estm.extract_data_from_simulator() + else: + X_fit_data[i_gene] = estm.simulator.x.T - # also get vel_N and vel_T - if NTR_vel: - if has_splicing: - if experiment_type == "kin": - Kc = np.clip(gamma[:, None], 0, 1 - 1e-3) # S - U slope - gamma_ = -(np.log(1 - Kc) / t[None, :]) # actual gamma + half_life[i_gene] = np.log(2) / Estm[i_gene][-1] - vel_U = U.multiply(csr_matrix(gamma_ / Kc)) - csr_matrix(beta).multiply(U_) # vel.vel_s(U_) - vel_S = vel.vel_s(U_, S_) + if self.model.lower().startswith("mixture"): + species = [0, 1, 2, 3] if self.has_splicing else [0, 1] + gof = GoodnessOfFit(estm.export_model(), params=estm.export_parameters()) + gof.prepare_data(self.time, cur_X_raw.T, species=species, normalize=True) + else: + gof = GoodnessOfFit( + estm.export_model(), + params=estm.export_parameters(), + x0=estm.simulator.x0, + ) + gof.prepare_data(self.time, cur_X_raw.T, normalize=True) - vel_N = (U - csr_matrix(Kc).multiply(U)).multiply(csr_matrix(gamma_ / Kc)) # vel.vel_u(U) - # scale back to true velocity via multiplying "gamma_ / Kc". 
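The per-gene loop above ultimately calls fit_lsq on the chosen Estimation_* class; the essence of that "direct" route is a bounded least-squares fit of the kinetic ODE solution to the time-resolved data, started from a rough alpha guess (the role guestimate_alpha plays above). A self-contained SciPy toy for the simplest no-splicing, deterministic case follows; the bound values and helper name are hypothetical, not dynamo's estimator API.

import numpy as np
from scipy.optimize import least_squares

def fit_direct_no_splicing(t_uniq, mean_new, alpha_hi=1000.0, gamma_hi=1000.0):
    """mean_new: mean labeled counts at each unique time point in t_uniq."""
    def residual(params):
        alpha, gamma = params
        # deterministic solution with u(0) = 0: u(t) = alpha / gamma * (1 - exp(-gamma * t))
        return alpha / gamma * (1 - np.exp(-gamma * t_uniq)) - mean_new
    res = least_squares(residual, x0=[1.0, 0.5], bounds=([0.0, 1e-4], [alpha_hi, gamma_hi]))
    alpha_hat, gamma_hat = res.x
    return alpha_hat, gamma_hat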
- vel_T = (U - csr_matrix(Kc).multiply(S)).multiply(csr_matrix(gamma_ / Kc)) - elif experiment_type == "mix_std_stm": - # steady state RNA: u0, stimulation RNA: u_new; - # cell-wise transcription rate under simulation: alpha1 - u0, u_new, alpha1 = solve_alpha_2p_mat( - t0=np.max(t) - t, - t1=t, - alpha0=alpha[0], - beta=beta, - u1=U, - ) - vel_U = alpha1 - csr_matrix(beta[:, None]).multiply(U_) - vel_S = vel.vel_s(U_, S_) + logLL[i_gene] = gof.calc_mean_squared_deviation() # .calc_gaussian_loglikelihood() - vel_N = alpha1 - csr_matrix(gamma[:, None]).multiply(u_new) - vel_T = alpha1 - csr_matrix(beta[:, None]).multiply(S) - else: - vel_U = vel.vel_u(U_) - vel_S = vel.vel_s(U_, S_) - vel_N = vel.vel_u(U) - vel_T = vel.vel_s(U, S - U) # need to consider splicing - else: - if experiment_type == "kin": - vel_U = np.nan - vel_S = np.nan + Estm_df = pd.DataFrame(np.vstack(Estm), columns=[*all_keys[: len(Estm[0])]]) - Kc = np.clip(gamma[:, None], 0, 1 - 1e-3) # S - U slope - gamma_ = -(np.log(1 - Kc) / t[None, :]) # actual gamma - vel_N = (U - csr_matrix(Kc).multiply(U)).multiply(csr_matrix(gamma_ / Kc)) # vel.vel_u(U) - # scale back to true velocity via multiplying "gamma_ / Kc". - vel_T = (U - csr_matrix(Kc).multiply(S)).multiply(csr_matrix(gamma_ / Kc)) - elif experiment_type == "mix_std_stm": - vel_U = np.nan - vel_S = np.nan + return Estm_df, half_life, cost, logLL, _param_ranges, X_data, X_fit_data - # steady state RNA: u0, stimulation RNA: u_new; - # cell-wise transcription rate under simulation: alpha1 - u0, u_new, alpha1 = solve_alpha_2p_mat( - t0=np.max(t) - t, - t1=t, - alpha0=alpha[0], - beta=gamma, - u1=U, - ) + def fit_degradation(self): + if self.has_splicing and self.splicing_labeling: + layers = ( + ["M_ul", "M_sl", "M_uu", "M_su"] + if ("M_ul" in self.subset_adata.layers.keys() and self.data_type == "smoothed") + else ["X_ul", "X_sl", "X_uu", "X_su"] + ) - vel_N = alpha1 - csr_matrix(gamma[:, None]).multiply(u_new) - vel_T = alpha1 - csr_matrix(gamma[:, None]).multiply(S) - else: - vel_U = np.nan - vel_S = np.nan - vel_N = vel.vel_u(U) - vel_T = vel.vel_u(S) # don't consider splicing - else: - if has_splicing: - if experiment_type == "kin": - Kc = np.clip(gamma[:, None], 0, 1 - 1e-3) # S - U slope - gamma_ = -(np.log(1 - Kc) / t[None, :]) # actual gamma + if self.model.lower() in ["deterministic", "stochastic"]: + layer_u = "M_ul" if ("M_ul" in self.subset_adata.layers.keys() and self.data_type == "smoothed") else "X_ul" + layer_s = "M_sl" if ("M_sl" in self.subset_adata.layers.keys() and self.data_type == "smoothed") else "X_sl" - vel_U = U_.multiply(csr_matrix(gamma_ / Kc) - csr_matrix(beta).multiply(U)) # vel.vel_u(U) - vel_S = vel.vel_s(U, S) + X, X_raw = prepare_data_has_splicing( + self.subset_adata, + self.subset_adata.var.index, + self.time, + layer_u=layer_u, + layer_s=layer_s, + total_layers=layers, + return_ntr=self.return_ntr, + ) + elif self.model.lower().startswith("mixture"): + X, _, X_raw = prepare_data_deterministic( + self.subset_adata, + self.subset_adata.var.index, + self.time, + layers=layers, + total_layers=layers, + return_ntr=self.return_ntr, + ) - vel_N = (U_ - csr_matrix(Kc).multiply(U_)).multiply( - csr_matrix(gamma_ / Kc) - ) # vel.vel_u(U_) - # scale back to true velocity via multiplying "gamma_ / Kc". 
- vel_T = (U_ - csr_matrix(Kc).multiply(S_)).multiply(csr_matrix(gamma_ / Kc)) - elif experiment_type == "mix_std_stm": - # steady state RNA: u0, stimulation RNA: u_new; - # cell-wise transcription rate under simulation: alpha1 - u0, u_new, alpha1 = solve_alpha_2p_mat( - t0=np.max(t) - t, - t1=t, - alpha0=alpha[0], - beta=beta, - u1=U_, - ) + if self.model.lower() == "deterministic": + X = [X[i][[0, 1], :] for i in range(len(X))] + _param_ranges = { + "beta": [0, 1000], + "gamma": [0, 1000], + } + x0 = { + "u0": [0, 1000], + "s0": [0, 1000], + } + Est, _ = Estimation_DeterministicDeg, Deterministic + elif self.model.lower() == "stochastic": + _param_ranges = { + "beta": [0, 1000], + "gamma": [0, 1000], + } + x0 = { + "u0": [0, 1000], + "s0": [0, 1000], + "uu0": [0, 1000], + "ss0": [0, 1000], + "us0": [0, 1000], + } + Est, _ = Estimation_MomentDeg, Moments_NoSwitching + else: + raise NotImplementedError( + f"model {self.model} with kinetic assumption is not implemented. " + f"current supported models for degradation experiment include: " + f"stochastic, deterministic." + ) + else: + total_layer = "M_t" if ("M_t" in self.subset_adata.layers.keys() and self.data_type == "smoothed") else "X_total" - vel_U = alpha1 - csr_matrix(beta[:, None]).multiply(U) - vel_S = vel.vel_s(U, S) + layer = "M_n" if ("M_n" in self.subset_adata.layers.keys() and self.data_type == "smoothed") else "X_new" + X, X_raw = prepare_data_no_splicing( + self.subset_adata, + self.subset_adata.var.index, + self.time, + layer=layer, + total_layer=total_layer, + return_ntr=self.return_ntr, + ) - vel_N = alpha1 - csr_matrix(gamma[:, None]).multiply(u_new) - vel_T = alpha1 - csr_matrix(beta[:, None]).multiply(S_) + if self.model.lower() == "deterministic": + X = [X[i][0, :] for i in range(len(X))] + _param_ranges = { + "gamma": [0, 10], + } + x0 = {"u0": [0, 1000]} + Est, _ = ( + Estimation_DeterministicDegNosp, + Deterministic_NoSplicing, + ) + elif self.model.lower() == "stochastic": + _param_ranges = { + "gamma": [0, 10], + } + x0 = {"u0": [0, 1000], "uu0": [0, 1000]} + Est, _ = Estimation_MomentDegNosp, Moments_NoSwitchingNoSplicing + else: + raise NotImplementedError( + f"model {self.model} with kinetic assumption is not implemented. " + f"current supported models for degradation experiment include: " + f"stochastic, deterministic.") + _param_ranges = update_dict(_param_ranges, self.param_rngs) + x0_ = np.vstack([ran for ran in x0.values()]).T - else: - vel_U = vel.vel_u(U) - vel_S = vel.vel_s(U, S) - vel_N = vel.vel_u(U_) - vel_T = vel.vel_s(U_, S_ - U_) # need to consider splicing - else: - if experiment_type == "kin": - vel_U = np.nan - vel_S = np.nan + n_genes = self.subset_adata.n_vars + cost, logLL = np.zeros(n_genes), np.zeros(n_genes) + all_keys = list(_param_ranges.keys()) + list(x0.keys()) + all_keys = [cur_key for cur_key in all_keys if cur_key != "alpha_i"] + half_life, Estm = np.zeros(n_genes), [None] * n_genes + X_data, X_fit_data = [None] * n_genes, [None] * n_genes + if self.experiment_type: + popt = [None] * n_genes - Kc = np.clip(gamma[:, None], 0, 1 - 1e-3) # S - U slope - gamma_ = -(np.log(1 - Kc) / t[None, :]) # actual gamma - vel_N = (U_ - csr_matrix(Kc).multiply(U_)).multiply( - csr_matrix(gamma_ / Kc) - ) # vel.vel_u(U_) - # scale back to true velocity via multiplying "gamma_ / Kc". 
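The removed branches here all apply the same correction for kinetics experiments under the steady-state route: the fitted new-vs-total slope k is first converted to the actual degradation rate via gamma = -ln(1 - k) / t, and the velocities are then rescaled by gamma / k. A dense NumPy illustration of just that arithmetic (generic array names, not dynamo layers):

import numpy as np

def kin_ss_velocities(new, total, k_slope, t):
    """new, total: (n_genes, n_cells) arrays; k_slope: (n_genes,) fitted slopes; t: (n_cells,) times."""
    k = np.clip(k_slope, 1e-6, 1 - 1e-3)[:, None]
    gamma_true = -np.log(1 - k) / t[None, :]   # slope -> actual degradation rate
    scale = gamma_true / k                     # "scale back to true velocity"
    vel_new = (new - k * new) * scale          # equals alpha_cellwise - gamma_true * new
    vel_total = (new - k * total) * scale      # equals alpha_cellwise - gamma_true * total
    return vel_new, vel_total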
- vel_T = (U_ - csr_matrix(Kc).multiply(S_)).multiply(csr_matrix(gamma_ / Kc)) - elif experiment_type == "mix_std_stm": - vel_U = np.nan - vel_S = np.nan + main_debug("model: %s, experiment_type: %s" % (self.model, self.experiment_type)) + for i_gene in tqdm(range(n_genes), desc="estimating kinetic-parameters using kinetic model"): + if self.model.lower().startswith("mixture"): + estm = Est + if self.model.lower() == "mixture": + cur_X_data = np.vstack([X[i_layer][i_gene] for i_layer in range(len(X))]) + if issparse(X_raw[0]): + cur_X_raw = np.hstack([X_raw[i_layer][:, i_gene].A for i_layer in range(len(X))]) + else: + cur_X_raw = np.hstack([X_raw[i_layer][:, i_gene] for i_layer in range(len(X))]) + else: + cur_X_data = X[i_gene] + cur_X_raw = X_raw[i_gene] - # steady state RNA: u0, stimulation RNA: u_new; - # cell-wise transcription rate under simulation: alpha1 - u0, u_new, alpha1 = solve_alpha_2p_mat( - t0=np.max(t) - t, - t1=t, - alpha0=alpha[0], - beta=gamma, - u1=U_, - ) + if issparse(cur_X_raw[0, 0]): + cur_X_raw = np.hstack((cur_X_raw[0, 0].A, cur_X_raw[1, 0].A)) - vel_N = alpha1 - csr_matrix(gamma[:, None]).multiply(u_new) - vel_T = alpha1 - csr_matrix(gamma[:, None]).multiply(S_) - else: - vel_U = np.nan - vel_S = np.nan - vel_N = vel.vel_u(U_) - vel_T = vel.vel_u(S_) # don't consider splicing + _, cost[i_gene] = estm.auto_fit(np.unique(self.time), cur_X_data) + ( + model_1, + model_2, + kinetic_parameters, + mix_x0, + ) = estm.export_dictionary().values() + tmp = list(kinetic_parameters.values()) + tmp.extend(mix_x0) + Estm[i_gene] = tmp else: - vel_U = vel.vel_u(U) - vel_S = vel.vel_s(U, S) - vel_N, vel_T = np.nan, np.nan + estm = Est() + cur_X_data, cur_X_raw = X[i_gene], X_raw[i_gene] - vel_P = vel.vel_p(S, P) + _, cost[i_gene] = estm.auto_fit(np.unique(self.time), cur_X_data) + Estm[i_gene] = estm.export_parameters()[1:] - adata = set_velocity( - adata, - vel_U, - vel_S, - vel_N, - vel_T, - vel_P, - _group, - cur_grp, - cur_cells_bools, - valid_bools_, - ind_for_proteins, - ) + if issparse(cur_X_raw[0, 0]): + cur_X_raw = np.hstack((cur_X_raw[0, 0].A, cur_X_raw[1, 0].A)) + # model_1, kinetic_parameters, mix_x0 = estm.export_dictionary().values() + # tmp = list(kinetic_parameters.values()) + # tmp.extend(mix_x0) + # Estm[i_gene] = tmp - adata = set_param_ss( - adata, - est, - alpha, - beta, - gamma, - eta, - delta, - experiment_type, - _group, - cur_grp, - kin_param_pre, - valid_bools_, - ind_for_proteins, - ) + X_data[i_gene] = cur_X_data + if self.model.lower().startswith("mixture"): + X_fit_data[i_gene] = estm.simulator.x.T + X_fit_data[i_gene][estm.model1.n_species:] *= estm.scale + else: + if hasattr(estm, "extract_data_from_simulator"): + X_fit_data[i_gene] = estm.extract_data_from_simulator() + else: + X_fit_data[i_gene] = estm.simulator.x.T - elif assumption_mRNA.lower() == "kinetic": - return_ntr = True if fraction_for_deg and experiment_type.lower() == "deg" else False + half_life[i_gene] = estm.calc_half_life("gamma") - if model_was_auto and experiment_type.lower() == "kin": - model = "mixture" - if est_method == "auto": - est_method = "direct" - data_type = "smoothed" if use_smoothed else "sfs" + if self.model.lower().startswith("mixture"): + species = [0, 1, 2, 3] if self.has_splicing else [0, 1] + gof = GoodnessOfFit(estm.export_model(), params=estm.export_parameters()) + gof.prepare_data(self.time, cur_X_raw.T, species=species, normalize=True) + else: + gof = GoodnessOfFit( + estm.export_model(), + params=estm.export_parameters(), + x0=estm.simulator.x0, + ) + 
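For the degradation branch, the quantity of interest is simply the first-order decay rate of the labeled RNA during the chase, which is what estm.calc_half_life("gamma") above converts to a half-life. A minimal per-gene version of that fit, using a log-linear regression rather than dynamo's Estimation_* least squares, would look like:

import numpy as np

def fit_decay_rate(labeled, t):
    """labeled: (n_cells,) labeled counts during the chase; t: (n_cells,) chase times."""
    y = np.log(np.clip(labeled, 1e-8, None))
    slope, _ = np.polyfit(t, y, 1)   # L(t) = L0 * exp(-gamma * t) => slope of ln(L) vs t is -gamma
    gamma = -slope
    half_life = np.log(2) / gamma
    return gamma, half_life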
gof.prepare_data(self.time, cur_X_raw.T, normalize=True) - (params, half_life, cost, logLL, param_ranges, cur_X_data, cur_X_fit_data,) = kinetic_model( - subset_adata, - tkey, - model, - est_method, - experiment_type, - has_splicing, - splicing_labeling, - has_switch=True, - param_rngs={}, - data_type=data_type, - return_ntr=return_ntr, - **est_kwargs, + logLL[i_gene] = gof.calc_mean_squared_deviation() # .calc_gaussian_loglikelihood() + + if self.est_method == "twostep" and self.has_splicing: + layers = ["M_u", "M_s"] if ("M_u" in self.subset_adata.layers.keys() and self.data_type == "smoothed") else ["X_u", + "X_s"] + U, S = ( + self.subset_adata.layers[layers[0]].T, + self.subset_adata.layers[layers[1]].T, + ) + US, S2 = self.subset_adata.layers["M_us"].T, self.subset_adata.layers["M_ss"].T + # beta, beta_r2 = lin_reg_gamma_synthesis(U, Ul, time, perc_right=100) + gamma_k, gamma_b, gamma_all_r2, gamma_all_logLL = fit_slope_stochastic( + S, U, US, S2, perc_left=None, perc_right=5 ) - if type(params) == dict: - alpha = params.pop("alpha") - params = pd.DataFrame(params) - else: - alpha = params.loc[:, "alpha"].values if "alpha" in params.columns else None + Estm_df = pd.DataFrame(np.vstack(Estm), columns=[*all_keys[: len(Estm[0])]]) + Estm_df["gamma_k"] = gamma_k # gamma_k = gamma / beta + Estm_df["beta"] = Estm_df["gamma"] / gamma_k # gamma_k = gamma / beta + Estm_df["gamma_r2"] = gamma_all_r2 + else: + Estm_df = pd.DataFrame(np.vstack(Estm), columns=[*all_keys[: len(Estm[0])]]) - len_t, len_g = len(np.unique(t)), len(_group) - if cur_grp == _group[0]: - if len_g != 1: - # X_data, X_fit_data = np.zeros((len_g, adata.n_vars, len_t)), np.zeros((len_g, adata.n_vars,len_t)) - X_data, X_fit_data = [None] * len_g, [None] * len_g + return Estm_df, half_life, cost, logLL, _param_ranges, X_data, X_fit_data - if len(_group) == 1: - X_data, X_fit_data = cur_X_data, cur_X_fit_data - else: - # X_data[cur_grp_i, :, :], X_fit_data[cur_grp_i, :, :] = cur_X_data, cur_X_fit_data - X_data[cur_grp_i], X_fit_data[cur_grp_i] = ( - cur_X_data, - cur_X_fit_data, - ) + def fit_mix_kinetics(self): + total_layer = "M_t" if ("M_t" in self.subset_adata.layers.keys() and self.data_type == "smoothed") else "X_total" - a, b, alpha_a, alpha_i, beta, gamma = ( - params.loc[:, "a"].values if "a" in params.columns else None, - params.loc[:, "b"].values if "b" in params.columns else None, - params.loc[:, "alpha_a"].values if "alpha_a" in params.columns else None, - params.loc[:, "alpha_i"].values if "alpha_i" in params.columns else None, - params.loc[:, "beta"].values if "beta" in params.columns else None, - params.loc[:, "gamma"].values if "gamma" in params.columns else None, + if self.model.lower() in ["deterministic"]: + layer = "M_n" if ("M_n" in self.subset_adata.layers.keys() and self.data_type == "smoothed") else "X_new" + X, X_raw = prepare_data_no_splicing( + self.subset_adata, + self.subset_adata.var.index, + self.time, + layer=layer, + total_layer=total_layer, ) - if alpha is None: - alpha = fbar(a, b, alpha_a, 0) if alpha_i is None else fbar(a, b, alpha_a, alpha_i) - all_kinetic_params = [ - "a", - "b", - "alpha_a", - "alpha_i", - "alpha", - "beta", - "gamma", - ] - - extra_params = params.loc[:, params.columns.difference(all_kinetic_params)] - # if alpha = None, set alpha to be U; N - gamma R - params = {"alpha": alpha, "beta": beta, "gamma": gamma, "t": t} - vel = Velocity(**params) - # Fix below: - U, S = get_U_S_for_velocity_estimation( - subset_adata, - use_smoothed, - has_splicing, - has_labeling, - 
log_unnormalized, - NTR_vel, + if self.model.lower() == "deterministic": + X = [X[i][0, :] for i in range(len(X))] + _param_ranges = { + "alpha": [0, 1000], + "gamma": [0, 1000], + } + x0 = {"u0": [0, 1000]} + Est = Estimation_KineticChase + else: + raise NotImplementedError( + f"only `deterministic` model implemented for mix_pulse_chase/mix_kin_deg experiment!" ) + _param_ranges = update_dict(_param_ranges, self.param_rngs) + x0_ = np.vstack([ran for ran in x0.values()]).T - U_, S_ = get_U_S_for_velocity_estimation( - subset_adata, - use_smoothed, - has_splicing, - has_labeling, - log_unnormalized, - not NTR_vel, - ) + n_genes = self.subset_adata.n_vars + cost, logLL = np.zeros(n_genes), np.zeros(n_genes) + all_keys = list(_param_ranges.keys()) + list(x0.keys()) + all_keys = [cur_key for cur_key in all_keys if cur_key != "alpha_i"] + half_life, Estm = np.zeros(n_genes), [None] * n_genes + X_data, X_fit_data = [None] * n_genes, [None] * n_genes + if self.experiment_type: + popt = [None] * n_genes - # also get vel_N and vel_T - if NTR_vel: - if has_splicing: - if experiment_type == "kin": - vel_U = vel.vel_u(U_) - vel_S = vel.vel_s(U_, S_) - vel.parameters["beta"] = gamma - vel_N = vel.vel_u(U) - vel_T = vel.vel_u(S) # no need to consider splicing - elif experiment_type == "deg": - if splicing_labeling: - vel_U = np.nan - vel_S = vel.vel_s(U_, S_) - vel_N = np.nan - vel_T = np.nan - else: - vel_U = np.nan - vel_S = vel.vel_s(U_, S_) - vel_N = np.nan - vel_T = np.nan - elif experiment_type in ["mix_kin_deg", "mix_pulse_chase"]: - vel_U = vel.vel_u(U_, repeat=True) - vel_S = vel.vel_s(U_, S_) - vel.parameters["beta"] = gamma - vel_N = vel.vel_u(U, repeat=True) - vel_T = vel.vel_u(S, repeat=True) # no need to consider splicing + main_debug("model: %s, experiment_type: %s" % (self.model, self.experiment_type)) + for i_gene in tqdm(range(n_genes), desc="estimating kinetic-parameters using kinetic model"): + if self.model.lower().startswith("mixture"): + estm = Est + if self.model.lower() == "mixture": + cur_X_data = np.vstack([X[i_layer][i_gene] for i_layer in range(len(X))]) + if issparse(X_raw[0]): + cur_X_raw = np.hstack([X_raw[i_layer][:, i_gene].A for i_layer in range(len(X))]) + else: + cur_X_raw = np.hstack([X_raw[i_layer][:, i_gene] for i_layer in range(len(X))]) else: - if experiment_type == "kin": - vel_U = np.nan - vel_S = np.nan - - # calculate cell-wise alpha, if est_method is twostep, this can be skipped - alpha_ = one_shot_alpha_matrix(U, gamma, t) + cur_X_data = X[i_gene] + cur_X_raw = X_raw[i_gene] - vel.parameters["alpha"] = alpha_ + if issparse(cur_X_raw[0, 0]): + cur_X_raw = np.hstack((cur_X_raw[0, 0].A, cur_X_raw[1, 0].A)) - vel_N = vel.vel_u(U) - vel_T = vel.vel_u(S) # don't consider splicing - elif experiment_type == "deg": - vel_U = np.nan - vel_S = np.nan - vel_N = np.nan - vel_T = np.nan - elif experiment_type in ["mix_kin_deg", "mix_pulse_chase"]: - vel_U = np.nan - vel_S = np.nan - vel_N = vel.vel_u(U, repeat=True) - vel_T = vel.vel_u(S) # don't consider splicing + _, cost[i_gene] = estm.auto_fit(np.unique(self.time), cur_X_data) + ( + model_1, + model_2, + kinetic_parameters, + mix_x0, + ) = estm.export_dictionary().values() + tmp = list(kinetic_parameters.values()) + tmp.extend(mix_x0) + Estm[i_gene] = tmp else: - if has_splicing: - if experiment_type == "kin": - vel_U = vel.vel_u(U) - vel_S = vel.vel_s(U, S) - vel.parameters["beta"] = gamma - vel_N = vel.vel_u(U_) - vel_T = vel.vel_u(S_) # no need to consider splicing - elif experiment_type == "deg": - if 
splicing_labeling: - vel_U = np.nan - vel_S = vel.vel_s(U, S) - vel_N = np.nan - vel_T = np.nan - else: - vel_U = np.nan - vel_S = vel.vel_s(U, S) - vel_N = np.nan - vel_T = np.nan - elif experiment_type in ["mix_kin_deg", "mix_pulse_chase"]: - vel_U = vel.vel_u(U, repeat=True) - vel_S = vel.vel_s(U, S) - vel.parameters["beta"] = gamma - vel_N = vel.vel_u(U_, repeat=True) - vel_T = vel.vel_u(S_, repeat=True) # no need to consider splicing - else: - if experiment_type == "kin": - vel_U = np.nan - vel_S = np.nan - - # calculate cell-wise alpha, if est_method is twostep, this can be skipped - alpha_ = one_shot_alpha_matrix(U_, gamma, t) + estm = Est() + cur_X_data, cur_X_raw = X[i_gene], X_raw[i_gene] - vel.parameters["alpha"] = alpha_ + popt[i_gene], cost[i_gene] = estm.auto_fit(np.unique(self.time), cur_X_data) + Estm[i_gene] = estm.export_parameters() - vel_N = vel.vel_u(U_) - vel_T = vel.vel_u(S_) # need to consider splicing - elif experiment_type == "deg": - vel_U = np.nan - vel_S = np.nan - vel_N = np.nan - vel_T = np.nan - elif experiment_type in ["mix_kin_deg", "mix_pulse_chase"]: - vel_U = np.nan - vel_S = np.nan - vel_N = vel.vel_u(U_, repeat=True) - vel_T = vel.vel_u(S_, repeat=True) # don't consider splicing + if issparse(cur_X_raw[0, 0]): + cur_X_raw = np.hstack((cur_X_raw[0, 0].A, cur_X_raw[1, 0].A)) + # model_1, kinetic_parameters, mix_x0 = estm.export_dictionary().values() + # tmp = list(kinetic_parameters.values()) + # tmp.extend(mix_x0) + # Estm[i_gene] = tmp - vel_P = vel.vel_p(S, P) + X_data[i_gene] = cur_X_data + if self.model.lower().startswith("mixture"): + X_fit_data[i_gene] = estm.simulator.x.T + X_fit_data[i_gene][estm.model1.n_species:] *= estm.scale + else: + # kinetic chase simulation + kinetic_chase = estm.simulator.x.T + # hidden x + tt, h = estm.simulator.calc_init_conc() - adata = set_velocity( - adata, - vel_U, - vel_S, - vel_N, - vel_T, - vel_P, - _group, - cur_grp, - cur_cells_bools, - valid_bools_, - ind_for_proteins, - ) + X_fit_data[i_gene] = [kinetic_chase, [tt, h]] - adata = set_param_kinetic( - adata, - alpha, - a, - b, - alpha_a, - alpha_i, - beta, - gamma, - cost, - logLL, - kin_param_pre, - extra_params, - _group, - cur_grp, - cur_cells_bools, - valid_bools_, - ) - # add protein related parameters in the moment model below: - elif model.lower() == "model_selection": - main_warning("Not implemented yet.") + half_life[i_gene] = estm.calc_half_life("gamma") - if group is not None and group in adata.obs[group]: - uns_key = group + "_dynamics" - else: - uns_key = "dynamics" + if self.model.lower().startswith("mixture"): + species = [0, 1, 2, 3] if self.has_splicing else [0, 1] + gof = GoodnessOfFit(estm.export_model(), params=estm.export_parameters()) + gof.prepare_data(self.time, cur_X_raw.T, species=species, normalize=True) + else: + gof = GoodnessOfFit( + estm.export_model(), + params=estm.export_parameters(), + x0=estm.simulator.x0, + ) + gof.prepare_data(self.time, cur_X_raw.T, normalize=True) - if sanity_check and experiment_type in ["kin", "deg"]: - sanity_check_cols = adata.var.columns.str.endswith("sanity_check") - adata.var["use_for_dynamics"] = adata.var.loc[:, sanity_check_cols].sum(1).astype(bool) - else: - adata.var["use_for_dynamics"] = False - adata.var.loc[valid_bools, "use_for_dynamics"] = True + logLL[i_gene] = gof.calc_mean_squared_deviation() # .calc_gaussian_loglikelihood() - adata.uns[uns_key] = { - "filter_gene_mode": filter_gene_mode, - "t": t, - "group": group, - "X_data": X_data, - "X_fit_data": X_fit_data, - "asspt_mRNA": 
assumption_mRNA, - "experiment_type": experiment_type, - "normalized": normalized, - "model": model, - "est_method": est_method, - "has_splicing": has_splicing, - "has_labeling": has_labeling, - "splicing_labeling": splicing_labeling, - "has_protein": has_protein, - "use_smoothed": use_smoothed, - "NTR_vel": NTR_vel, - "log_unnormalized": log_unnormalized, - "fraction_for_deg": fraction_for_deg, - } + if self.est_method == "twostep": + if self.has_splicing: + layers = ( + ["M_u", "M_s"] if ("M_u" in self.subset_adata.layers.keys() and self.data_type == "smoothed") else ["X_u", + "X_s"] + ) + U, S = ( + self.subset_adata.layers[layers[0]].T, + self.subset_adata.layers[layers[1]].T, + ) + US, S2 = ( + self.subset_adata.layers["M_us"].T, + self.subset_adata.layers["M_ss"].T, + ) + # beta, beta_r2 = lin_reg_gamma_synthesis(U, Ul, time, perc_right=100) + ( + gamma_k, + gamma_b, + gamma_all_r2, + gamma_all_logLL, + ) = fit_slope_stochastic(S, U, US, S2, perc_left=None, perc_right=5) - if del_2nd_moments: - remove_2nd_moments(adata) + Estm_df = pd.DataFrame(np.vstack(Estm), columns=[*all_keys[: len(Estm[0])]]) + Estm_df["gamma_k"] = gamma_k # gamma_k = gamma / beta + Estm_df["beta"] = Estm_df["gamma"] / gamma_k # gamma_k = gamma / beta + Estm_df["gamma_r2"] = gamma_all_r2 + else: + Estm_df = pd.DataFrame(np.vstack(Estm), columns=[*all_keys[: len(Estm[0])]]) + Estm_df["gamma_k"] = Estm_df["gamma"] # fix a bug in pl.dynamics + else: + Estm_df = pd.DataFrame(np.vstack(Estm), columns=[*all_keys[: len(Estm[0])]]) - return adata + return Estm_df, half_life, cost, logLL, _param_ranges, X_data, X_fit_data def kinetic_model( From b661403e8e8dc9623443c61a3088355ab3d75259 Mon Sep 17 00:00:00 2001 From: sichao Date: Wed, 26 Jul 2023 15:49:15 -0400 Subject: [PATCH 29/31] call refactored params estimation --- dynamo/tools/dynamics.py | 98 ++++++++++++++++++++++++---------------- 1 file changed, 59 insertions(+), 39 deletions(-) diff --git a/dynamo/tools/dynamics.py b/dynamo/tools/dynamics.py index 4cc08fd09..7d9984fad 100755 --- a/dynamo/tools/dynamics.py +++ b/dynamo/tools/dynamics.py @@ -304,8 +304,14 @@ def __init__(self, dynamics_kwargs: Dict): self.tkey = self.adata.uns["pp"]["tkey"] if dynamics_kwargs["tkey"] is None else dynamics_kwargs["tkey"] self.est_kwargs = dynamics_kwargs["est_kwargs"] - def estimate_params_utils(self, params_est_kwargs): - pass + def estimate_params_utils(self, fit_kwargs=None, **kwargs): + self.est = ss_estimation(**kwargs) + if self.model.lower() == "deterministic": + self.est.fit_conventional_deterministic(**fit_kwargs) + elif self.model.lower() == "stochastic": + self.est.fit_conventional_stochastic(**fit_kwargs) + else: + raise NotImplementedError("Method not implemented.") def estimate_params_ss(self, subset_adata: AnnData, **est_params_args): """Estimate velocity parameters with steady state mRNA assumption.""" @@ -329,7 +335,8 @@ def estimate_params_ss(self, subset_adata: AnnData, **est_params_args): self.TotalCounts = None self.NewSmoothCSP = None - self.est = ss_estimation( + self.estimate_params_utils( + fit_kwargs=self.est_kwargs, U=self.U.copy() if self.U is not None else None, Ul=self.Ul.copy() if self.Ul is not None else None, S=self.S.copy() if self.S is not None else None, @@ -351,20 +358,7 @@ def estimate_params_ss(self, subset_adata: AnnData, **est_params_args): concat_data=self.concat_data, cores=self.cores, **ss_estimation_kwargs, - ) # U: (unlabeled) unspliced; S: (unlabeled) spliced; U / Ul: old and labeled; U, Ul, S, Sl: uu/ul/su/sl - - with 
warnings.catch_warnings(): - warnings.simplefilter("ignore") - - if self.experiment_type.lower() in ["one-shot", "one_shot"]: - if self.one_shot_method == "storm-csp": - self.est.fit(one_shot_method=self.one_shot_method, perc_right=50, **self.est_kwargs) - else: - self.est.fit(one_shot_method=self.one_shot_method, **self.est_kwargs) - else: - # experiment_type can be `kin` also and by default use - # conventional method to estimate k but correct for time - self.est.fit(**self.est_kwargs) + ) self.alpha, self.beta, self.gamma, self.eta, self.delta = self.est.parameters.values() @@ -378,14 +372,15 @@ def estimate_params_kin(self, cur_grp_i: int, cur_grp: str, subset_adata: AnnDat self.est_method = "direct" data_type = "smoothed" if self.use_smoothed else "sfs" - (params, half_life, self.cost, self.logLL, param_ranges, cur_X_data, cur_X_fit_data,) = kinetic_model( - subset_adata, - self.tkey, - self.model, - self.est_method, - self.experiment_type, - self.has_splicing, - self.splicing_labeling, + (params, half_life, self.cost, self.logLL, param_ranges, cur_X_data, cur_X_fit_data,) = self.estimate_params_utils( + fit_kwargs=self.est_kwargs, + subset_adata=subset_adata, + tkey=self.tkey, + model=self.model, + est_method=self.est_method, + experiment_type=self.experiment_type, + has_splicing=self.has_splicing, + splicing_labeling=self.splicing_labeling, has_switch=True, param_rngs={}, data_type=data_type, @@ -892,6 +887,14 @@ def calculate_vels( class OneShotDynamics(LabeledDynamics): """Dynamics model for the one shot experiment, where there is only one labeling time point.""" + def estimate_params_utils(self, fit_kwargs=None, **kwargs): + self.est = ss_estimation(**kwargs) + if self.experiment_type.lower() in ["one-shot", "one_shot"]: + if self.one_shot_method == "storm-csp": + self.est.fit_oneshot(one_shot_method=self.one_shot_method, perc_right=50, **fit_kwargs) + else: + self.est.fit_oneshot(one_shot_method=self.one_shot_method, **fit_kwargs) + def calculate_vel_U( self, vel: Velocity, @@ -1063,8 +1066,9 @@ def calculate_vels( class TwoStepKineticsDynamics(KineticsDynamics): - def estimate_params_utils(self, params_est_kwargs): - pass + def estimate_params_utils(self, fit_kwargs=None, **kwargs): + kin_estimation = KineticEstimation(**kwargs) + return kin_estimation.fit_twostep_kinetics(**fit_kwargs) class KineticsStormDynamics(LabeledDynamics): @@ -1072,8 +1076,9 @@ class KineticsStormDynamics(LabeledDynamics): models. In Model 1, only transcription and mRNA degradation were considered. In Model 2, we considered transcription, splicing, and spliced mRNA degradation. And in Model 3, we considered the switching of gene expression states, transcription in the active state, and mRNA degradation.""" - def estimate_params_utils(self, params_est_kwargs): - pass + def estimate_params_utils(self, fit_kwargs=None, **kwargs): + kin_estimation = KineticEstimation(**kwargs) + return kin_estimation.fit_storm(**fit_kwargs) def calculate_vel_U( self, @@ -1142,14 +1147,19 @@ def calculate_vels( class DirectKineticsDynamics(KineticsDynamics): - def estimate_params_utils(self, params_est_kwargs): - pass + def estimate_params_utils(self, fit_kwargs=None, **kwargs): + kin_estimation = KineticEstimation(**kwargs) + return kin_estimation.fit_direct_kinetics(**fit_kwargs) class DegradationDynamics(LabeledDynamics): """Dynamics model for the degradation experiment. 
In degradation experiment, samples are chased after an extended 4sU (or other nucleotide analog) labeling period and the wash-out to observe the decay of the abundance of the (labeled) unspliced and spliced RNA decay over time.""" + def estimate_params_utils(self, fit_kwargs=None, **kwargs): + kin_estimation = KineticEstimation(**kwargs) + return kin_estimation.fit_degradation(**fit_kwargs) + def calculate_vel_U( self, vel: Velocity, @@ -1193,6 +1203,10 @@ def calculate_vel_T( class MixStdStmDynamics(LabeledDynamics): """Dynamics model for the mixed steady state and stimulation labeling (mix_std_stm) experiment.""" + def estimate_params_utils(self, fit_kwargs=None, **kwargs): + self.est = ss_estimation(**kwargs) + self.est.fit_mix_std_stm(**fit_kwargs) + def calculate_vel_U( self, vel: Velocity, @@ -1268,6 +1282,10 @@ def calculate_vels( class MixKineticsDynamics(LabeledDynamics): """Dynamics model for two mix experiment type: mix_kin_deg and mix_pulse_chase.""" + def estimate_params_utils(self, fit_kwargs=None, **kwargs): + kin_estimation = KineticEstimation(**kwargs) + return kin_estimation.fit_mix_kinetics(**fit_kwargs) + def calculate_vel_U( self, vel: Velocity, @@ -1444,9 +1462,11 @@ def dynamics_wrapper( if assumption_mRNA == "ss": estimator = SSKineticsDynamics(dynamics_kwargs) elif assumption_mRNA == "kinetic": - if model == 'deterministic': - estimator = KineticsDynamics(dynamics_kwargs) - elif model == 'stochastic': + if est_method == 'twostep': + estimator = TwoStepKineticsDynamics(dynamics_kwargs) + elif est_method == "direct": + estimator = DirectKineticsDynamics(dynamics_kwargs) + elif "storm" in est_method: estimator = KineticsStormDynamics(dynamics_kwargs) else: raise NotImplementedError("This method has not been implemented.") @@ -2400,7 +2420,7 @@ def __init__( return_ntr: bool = False, **est_kwargs, ): - self.subset_data = subset_adata + self.subset_adata = subset_adata self.tkey = tkey self.model = model self.est_method = est_method @@ -2608,7 +2628,7 @@ def fit_storm(self): self.subset_adata.layers[layers_smoothed[0]].T, self.subset_adata.layers[layers_smoothed[1]].T, ) - (gamma_init, _, _, _, _,) = lin_reg_gamma_synthesis(Total_smoothed, New_smoothed, time, + (gamma_init, _, _, _, _,) = lin_reg_gamma_synthesis(Total_smoothed, New_smoothed, self.time, perc_right=5) # Read raw counts @@ -2629,7 +2649,7 @@ def fit_storm(self): cell_total = self.subset_adata.obs['initial_cell_size'].astype("float").values if "storm-csp" == self.est_method: - gamma, gamma_r2, gamma_r2_raw, alpha = storm.mle_cell_specific_poisson(New_raw, time, + gamma, gamma_r2, gamma_r2_raw, alpha = storm.mle_cell_specific_poisson(New_raw, self.time, gamma_init, cell_total) elif "storm-cszip" == self.est_method: gamma, prob_off, gamma_r2, gamma_r2_raw, alpha = storm.mle_cell_specific_zero_inflated_poisson( @@ -2986,9 +3006,9 @@ def fit_direct_kinetics(self): alpha0 = guestimate_alpha(np.sum(cur_X_data, 0), np.unique(self.time)) else: alpha0 = ( - guestimate_alpha(cur_X_data, np.unique(time)) + guestimate_alpha(cur_X_data, np.unique(self.time)) if cur_X_data.ndim == 1 - else guestimate_alpha(cur_X_data[0], np.unique(time)) + else guestimate_alpha(cur_X_data[0], np.unique(self.time)) ) if self.model.lower() == "stochastic": From 5b1267b5d4093cdae80f60af26ee2a2fd192709f Mon Sep 17 00:00:00 2001 From: Sichao25 Date: Thu, 27 Jul 2023 16:52:01 -0400 Subject: [PATCH 30/31] create docstr --- dynamo/estimation/csc/velocity.py | 7 ++++ dynamo/tools/dynamics.py | 57 ++++++++++++++++++++++++++++++- 2 files changed, 
63 insertions(+), 1 deletion(-) diff --git a/dynamo/estimation/csc/velocity.py b/dynamo/estimation/csc/velocity.py index 85229499c..d1bfc8c8c 100755 --- a/dynamo/estimation/csc/velocity.py +++ b/dynamo/estimation/csc/velocity.py @@ -1604,6 +1604,7 @@ def fit( ) = (delta, delta_intercept, delta_r2, delta_logLL) def fit_protein(self, intercept, perc_left, perc_right, cores): + """Fit the input data to estimate parameters for protein.""" if np.all(self._exist_data("p", "su")): ind_for_proteins = self.ind_for_proteins n_genes = len(ind_for_proteins) if ind_for_proteins is not None else 0 @@ -1672,6 +1673,8 @@ def fit_conventional_deterministic( perc_left=None, perc_right=5, ): + """Fit the input data to estimate parameters for conventional experiment type and steady-state kinetics + experiment type with deterministic model.""" n_genes = self.get_n_genes() cores = max(1, int(self.cores)) if np.all(self._exist_data("uu", "su")): @@ -1790,6 +1793,8 @@ def fit_conventional_stochastic( perc_left=None, perc_right=5, ): + """Fit the input data to estimate parameters for conventional experiment type and steady-state kinetics + experiment type with stochastic model.""" n_genes = self.get_n_genes() cores = max(1, int(self.cores)) if np.all(self._exist_data("uu", "su")): @@ -1976,6 +1981,7 @@ def fit_oneshot( clusters=None, one_shot_method="combined", ): + """Fit the input data to estimate parameters for one-shot experiment type.""" n_genes = self.get_n_genes() cores = max(1, int(self.cores)) if len(np.unique(self.t)) > 1: @@ -2570,6 +2576,7 @@ def fit_mix_std_stm( perc_left=None, perc_right=5, ): + """Fit the input data to estimate parameters for mix_std_stm experiment type.""" n_genes = self.get_n_genes() cores = max(1, int(self.cores)) t_min, t_max = np.min(self.t), np.max(self.t) diff --git a/dynamo/tools/dynamics.py b/dynamo/tools/dynamics.py index 7d9984fad..dd771459c 100755 --- a/dynamo/tools/dynamics.py +++ b/dynamo/tools/dynamics.py @@ -305,6 +305,7 @@ def __init__(self, dynamics_kwargs: Dict): self.est_kwargs = dynamics_kwargs["est_kwargs"] def estimate_params_utils(self, fit_kwargs=None, **kwargs): + """Default method to estimate the velocity parameters.""" self.est = ss_estimation(**kwargs) if self.model.lower() == "deterministic": self.est.fit_conventional_deterministic(**fit_kwargs) @@ -363,7 +364,7 @@ def estimate_params_ss(self, subset_adata: AnnData, **est_params_args): self.alpha, self.beta, self.gamma, self.eta, self.delta = self.est.parameters.values() def estimate_params_kin(self, cur_grp_i: int, cur_grp: str, subset_adata: AnnData, **est_params_args): - """Estimate velocity parameters with kinetic mRNA assumption.""" + """Estimate velocity parameters with kinetic mRNA assumption. 
Will be overriden in the subclass.""" return_ntr = True if self.fraction_for_deg and self.experiment_type.lower() == "deg" else False if self.model_was_auto and self.experiment_type.lower() == "kin": @@ -1066,6 +1067,7 @@ def calculate_vels( class TwoStepKineticsDynamics(KineticsDynamics): + """Dynamic models for the kinetic experiment with two-step method.""" def estimate_params_utils(self, fit_kwargs=None, **kwargs): kin_estimation = KineticEstimation(**kwargs) return kin_estimation.fit_twostep_kinetics(**fit_kwargs) @@ -1147,6 +1149,7 @@ def calculate_vels( class DirectKineticsDynamics(KineticsDynamics): + """Dynamic models for the kinetic experiment with direct method.""" def estimate_params_utils(self, fit_kwargs=None, **kwargs): kin_estimation = KineticEstimation(**kwargs) return kin_estimation.fit_direct_kinetics(**fit_kwargs) @@ -2405,6 +2408,7 @@ def dynamics( class KineticEstimation: + """The clss to estimate the parameters required for velocity estimation when the mRNA assumption is 'kinetic'.""" def __init__( self, subset_adata: AnnData, @@ -2420,6 +2424,52 @@ def __init__( return_ntr: bool = False, **est_kwargs, ): + """Constructor. + + Args: + subset_adata: an AnnData object with invalid genes trimmed. + tkey: the column key for the labeling time of cells in .obs. Used for labeling based scRNA-seq data. If `tkey` + is None, then `adata.uns["pp"]["tkey"]` will be checked and used if exists. + model: String indicates which estimation model will be used. + Available options are: + (1) 'deterministic': The method based on `deterministic` ordinary differential equations; + (2) 'stochastic' or `moment`: The new method from us that is based on `stochastic` master equations; + Note that `kinetic` model doesn't need to assume the `experiment_type` is not `conventional`. As other + labeling experiments, if you specify the `tkey`, dynamo can also apply `kinetic` model on `conventional` + scRNA-seq datasets. A "model_selection" model will be supported soon in which alpha, beta and gamma will be + modeled as a function of time. + est_method: Available options when the `assumption_mRNA` is 'kinetic' include: + (1) 'auto': dynamo will choose the suitable estimation method based on the `assumption_mRNA`, + `experiment_type` and `model` parameter. + (2) `twostep`: first for each time point, estimate K (1-e^{-rt}) using the total and new RNA data. Then + use regression via t-np.log(1-K) to get degradation rate gamma. When splicing and labeling data both + exist, replacing new/total with ul/u can be used to estimate beta. Suitable for velocity estimation. + (3) `direct` (default): method that directly uses the kinetic model to estimate rate parameters, + generally not good for velocity estimation. + Under `kinetic` model, choosing estimation is `experiment_type` dependent. For `kinetics` experiments, + dynamo supposes methods including RNA bursting or without RNA bursting. Dynamo also adaptively estimates + parameters, based on whether the data has splicing or without splicing. + Under `kinetic` assumption, the above method uses non-linear least square fitting. In order to return + estimated parameters (including RNA half-life), it additionally returns the log-likelihood of the + fitting, which will be used for transition matrix and velocity embedding. + All `est_method` uses least square to estimate optimal parameters with latin cubic sampler for initial + sampling. + experiment_type: the experiment type of the data. 
+            has_splicing: whether the object contains unspliced and spliced data.
+            splicing_labeling: whether the object contains both splicing and labeling data.
+            has_switch: whether there should be a switch for the stochastic model.
+            param_rngs: the range set for each parameter.
+            data_type: the data type, which could be "smoothed" or "sfs". Defaults to "sfs".
+            return_ntr: whether to deal with the new/total ratio. Defaults to False.
+            est_kwargs: additional keyword arguments for the fitting function.
+
+        Returns:
+            Each `fit_*` method returns a tuple (Estm_df, half_life, cost, logLL, _param_ranges, X_data, X_fit_data),
+            where Estm_df contains the parameters required for mRNA velocity calculation, half_life is the half-life
+            of spliced mRNA, cost is the cost of the kinetic parameter estimation, logLL is the log-likelihood of the
+            kinetic parameter estimation, _param_ranges is the intended range of parameter estimation, X_data is the
+            data used for parameter estimation, and X_fit_data is the data that gets fitted during parameter
+            estimation.
+        """
         self.subset_adata = subset_adata
         self.tkey = tkey
         self.model = model
@@ -2435,6 +2485,7 @@ def __init__(
         self.time = subset_adata.obs[tkey].astype("float").values
     def fit_twostep_kinetics(self):
+        """Fit the input data to estimate parameters for the kinetics experiment type with the two-step method."""
         if self.has_splicing:
             layers = (
                 ["M_u", "M_s", "M_t", "M_n"]
@@ -2543,6 +2594,7 @@ def fit_twostep_kinetics(self):
             X_fit_data,
         )
     def fit_storm(self):
+        """Fit the input data to estimate parameters for the kinetics experiment type with the storm method."""
         if self.has_splicing:
             # Initialization based on the steady-state assumption
             layers_smoothed = ["M_u", "M_s", "M_t", "M_n"]
@@ -2690,6 +2742,7 @@ def fit_storm(self):
         )
     def fit_direct_kinetics(self):
+        """Fit the input data to estimate parameters for the kinetics experiment type with the direct method."""
         if self.has_splicing and self.splicing_labeling:
             layers = (
                 ["M_ul", "M_sl", "M_uu", "M_su"]
@@ -3060,6 +3113,7 @@ def fit_direct_kinetics(self):
         return Estm_df, half_life, cost, logLL, _param_ranges, X_data, X_fit_data
     def fit_degradation(self):
+        """Fit the input data to estimate parameters for the degradation experiment type."""
         if self.has_splicing and self.splicing_labeling:
             layers = (
                 ["M_ul", "M_sl", "M_uu", "M_su"]
@@ -3256,6 +3310,7 @@ def fit_degradation(self):
         return Estm_df, half_life, cost, logLL, _param_ranges, X_data, X_fit_data
     def fit_mix_kinetics(self):
+        """Fit the input data to estimate parameters for the mix_kinetics_degradation experiment type."""
         total_layer = "M_t" if ("M_t" in self.subset_adata.layers.keys() and self.data_type == "smoothed") else "X_total"
         if self.model.lower() in ["deterministic"]:

From 7ac035c9943386667848fb2ac5ac721507a662a2 Mon Sep 17 00:00:00 2001
From: sichao
Date: Tue, 22 Aug 2023 16:13:13 -0400
Subject: [PATCH 31/31] delete tutorial script

---
 scEU-seq_CellCycle_ICSP.py | 213 -------------------------------------
 1 file changed, 213 deletions(-)
 delete mode 100644 scEU-seq_CellCycle_ICSP.py

diff --git a/scEU-seq_CellCycle_ICSP.py b/scEU-seq_CellCycle_ICSP.py
deleted file mode 100644
index d0f26a4eb..000000000
--- a/scEU-seq_CellCycle_ICSP.py
+++ /dev/null
@@ -1,213 +0,0 @@
-#!/usr/bin/env python
-# -*- coding:utf-8 -*-
-
-import warnings
-
-warnings.filterwarnings('ignore')
-
-import dynamo as dyn
-
-filename = './data/rpe1.h5ad'
-
-rpe1 = dyn.read(filename)
-
-dyn.convert2float(rpe1, ['Cell_cycle_possition', 'Cell_cycle_relativePos'])
-
-rpe1.obs.exp_type.value_counts()
-
-rpe1[rpe1.obs.exp_type == 'Chase', 
:].obs.time.value_counts() - -rpe1[rpe1.obs.exp_type == 'Pulse', :].obs.time.value_counts() - -rpe1_kinetics = rpe1[rpe1.obs.exp_type == 'Pulse', :] -rpe1_kinetics.obs['time'] = rpe1_kinetics.obs['time'].astype(str) -rpe1_kinetics.obs.loc[rpe1_kinetics.obs['time'] == 'dmso', 'time'] = -1 -rpe1_kinetics.obs['time'] = rpe1_kinetics.obs['time'].astype(float) -rpe1_kinetics = rpe1_kinetics[rpe1_kinetics.obs.time != -1, :] - -rpe1_genes = ['UNG', 'PCNA', 'PLK1', 'HPRT1'] - -rpe1_kinetics.obs.time = rpe1_kinetics.obs.time.astype('float') -rpe1_kinetics.obs.time = rpe1_kinetics.obs.time / 60 # convert minutes to hours - -print(rpe1_kinetics.obs.time.value_counts()) - -# from dynamo.tools.recipes import recipe_kin_data -# # velocity -# recipe_kin_data(adata=rpe1_kinetics, -# keep_filtered_genes=True, -# keep_raw_layers=True, -# del_2nd_moments=True, -# tkey='time', -# n_top_genes=1000, -# # est_method='twostep', -# ) - -from dynamo.tools.dynamics import dynamics_wrapper -from dynamo.tools.dimension_reduction import reduceDimension -from dynamo.tools.cell_velocities import cell_velocities -from dynamo.preprocessing.utils import ( - del_raw_layers, - detect_experiment_datatype, - reset_adata_X, - collapse_species_adata -) -from dynamo.preprocessing import Preprocessor -from dynamo.tools.moments import moments -from dynamo.preprocessing.pca import pca -from dynamo.tools.connectivity import neighbors,normalize_knn_graph -import numpy as np - - -keep_filtered_cells = False -keep_filtered_genes = False -keep_raw_layers = True -del_2nd_moments = True -has_splicing, has_labeling, splicing_labeling = True, True, True -if has_splicing and has_labeling and splicing_labeling: - layers = ["X_new", "X_total", "X_uu", "X_ul", "X_su", "X_sl"] -elif has_labeling: - layers = ["X_new", "X_total"] - -# Preprocessing -preprocessor = Preprocessor(cell_cycle_score_enable=True) -preprocessor.config_monocle_recipe(rpe1_kinetics, n_top_genes=1000) -preprocessor.size_factor_kwargs.update( - { - "X_total_layers": False, - "splicing_total_layers": False, - } -) -preprocessor.normalize_by_cells_function_kwargs.update( - { - "X_total_layers": False, - "splicing_total_layers": False, - "keep_filtered": keep_filtered_genes, - "total_szfactor": "total_Size_Factor", - } -) -preprocessor.filter_cells_by_outliers_kwargs["keep_filtered"] = keep_filtered_cells -preprocessor.select_genes_kwargs["keep_filtered"] = keep_filtered_genes - -rpe1_kinetics = collapse_species_adata(rpe1_kinetics) -if True: - reset_adata_X(rpe1_kinetics, experiment_type="kin", has_labeling=has_labeling, has_splicing=has_splicing) -preprocessor.preprocess_adata_monocle(adata=rpe1_kinetics, tkey='time', experiment_type="kin") -if not keep_raw_layers: - del_raw_layers(rpe1_kinetics) - -tkey = rpe1_kinetics.uns["pp"]["tkey"] -# first calculate moments for labeling data relevant layers using total based connectivity graph -moments(rpe1_kinetics, group=tkey, layers=layers) - -# then we want to calculate moments for spliced and unspliced layers based on connectivity graph from spliced -# data. 
-# first get X_spliced based pca embedding -CM = np.log1p(rpe1_kinetics[:, rpe1_kinetics.var.use_for_pca].layers["X_spliced"].A) -cm_genesums = CM.sum(axis=0) -valid_ind = np.logical_and(np.isfinite(cm_genesums), cm_genesums != 0) -valid_ind = np.array(valid_ind).flatten() - -pca(rpe1_kinetics, CM[:, valid_ind], pca_key="X_spliced_pca") -# then get neighbors graph based on X_spliced_pca -neighbors(rpe1_kinetics, X_data=rpe1_kinetics.obsm["X_spliced_pca"], layer="X_spliced") -# then normalize neighbors graph so that each row sums up to be 1 -conn = normalize_knn_graph(rpe1_kinetics.obsp["connectivities"] > 0) -# then calculate moments for spliced related layers using spliced based connectivity graph -moments(rpe1_kinetics, conn=conn, layers=["X_spliced", "X_unspliced"]) -# then perform kinetic estimations with properly preprocessed layers for either the labeling or the splicing -# data -moments(rpe1_kinetics, conn=conn, layers=["uu", "ul", "su", "sl", "new", "total"]) - -dynamics_wrapper(rpe1_kinetics, model="stochastic", est_method="storm-icsp", del_2nd_moments=del_2nd_moments) -reduceDimension(rpe1_kinetics, reduction_method='umap') -cell_velocities(rpe1_kinetics, basis='umap') - -rpe1_kinetics.obsm['X_RFP_GFP'] = rpe1_kinetics.obs.loc[:, - ['RFP_log10_corrected', 'GFP_log10_corrected']].values.astype('float') - -# total velocity -dyn.tl.reduceDimension(rpe1_kinetics, reduction_method='umap') -dyn.tl.cell_velocities(rpe1_kinetics, enforce=True, vkey='velocity_T', ekey='M_t', basis='RFP_GFP') -dyn.pl.streamline_plot(rpe1_kinetics, color=['cell_cycle_phase'], basis='RFP_GFP') - -# spliced RNA velocity -dyn.tl.reduceDimension(rpe1_kinetics, reduction_method='umap') -dyn.tl.cell_velocities(rpe1_kinetics, enforce=True, vkey='velocity_S', ekey='M_s', basis='RFP_GFP') -dyn.pl.streamline_plot(rpe1_kinetics, color=['cell_cycle_phase'], basis='RFP_GFP') - -# # for velocity gene-wise parameters -# import matplotlib.pyplot as plt -# import scanpy as sc -# sc.set_figure_params(scanpy=True, fontsize=6) -# plt.rcParams['font.size'] = '6' -# dyn.configuration.set_figure_params(dpi_save=600, figsize=(17 / 3 / 2.54, 17 / 3 / 2.54 * (4 / 6))) -# -# save_path = './cell_wise_figures/' -# dyn.pl.streamline_plot(rpe1_kinetics, color=['cell_cycle_phase'], vkey='velocity_T', ekey='M_t', basis='RFP_GFP', -# save_show_or_return='show', -# save_kwargs={"path": save_path, "prefix": 'icsp_vt_stream_gene-wise_alpha_beta', "dpi": 600, 'ext':'png'}) -# -# -# # spliced RNA velocity -# dyn.tl.reduceDimension(rpe1_kinetics, reduction_method='umap') -# dyn.tl.cell_velocities(rpe1_kinetics, enforce=True, vkey='velocity_S', ekey='M_s', basis='RFP_GFP') -# dyn.pl.streamline_plot(rpe1_kinetics, color=['cell_cycle_phase'], basis='RFP_GFP') -# -# # for velocity gene-wise parameters -# save_path = './cell_wise_figures/' -# dyn.pl.streamline_plot(rpe1_kinetics, color=['cell_cycle_phase'], vkey='velocity_S', ekey='M_s', basis='RFP_GFP', -# save_show_or_return='show', -# save_kwargs={"path": save_path, "prefix": 'icsp_vs_stream_gene-wise_alpha_beta', "dpi": 600, 'ext':'png'}) -# -# -# import scvelo as scv -# import matplotlib.pyplot as plt -# -# plt.rcParams['font.size'] = '7' -# dpi = 600 -# figsize = (6, 3) -# -# well_fitted = rpe1_kinetics.var['gamma_r2'] > 0 -# well_fitted_genes = well_fitted[well_fitted].index -# # well_fitted_genes = rpe1_kinetics.var['gamma_r2'].sort_values(ascending=False).index[:400] -# save_path = './cell_wise_figures/icsp_beta.png' -# ax = scv.pl.heatmap(rpe1_kinetics, -# var_names=well_fitted_genes, -# 
sortby='Cell_cycle_relativePos', -# col_color='cell_cycle_phase', -# n_convolve=100, -# layer='cell_wise_beta', -# figsize=(6, 3), -# show=False) -# # plt.savefig(save_path, dpi=dpi, figsize=figsize) -# plt.show() -# -# -# # dyn.configuration.set_figure_params(fontsize=6, dpi=300) -# # genes = ['HMGA2'] -# # dyn.pl.phase_portraits(rpe1_kinetics, genes=genes, color='cell_cycle_phase', basis='RFP_GFP', vkey='velocity_T', -# # ekey='M_t', show_arrowed_spines=False, show_quiver=True, quiver_size=5, -# # figsize=(6 * 0.53, 4 * 0.53)) -# # genes = ['DCBLD2'] -# # dyn.pl.phase_portraits(rpe1_kinetics, genes=genes, color='cell_cycle_phase', basis='RFP_GFP', vkey='velocity_T', -# # ekey='M_t', show_arrowed_spines=False, show_quiver=True, quiver_size=5, -# # figsize=(6 * 0.53, 4 * 0.53)) -# # genes = ['HIPK2'] -# # dyn.pl.phase_portraits(rpe1_kinetics, genes=genes, color='cell_cycle_phase', basis='RFP_GFP', vkey='velocity_T', -# # ekey='M_t', show_arrowed_spines=False, show_quiver=True, quiver_size=5, -# # figsize=(6 * 0.53, 4 * 0.53)) -# # -# # # dyn.configuration.set_figure_params(fontsize=6, dpi=300) -# # # genes = ['HMGA2'] -# # # dyn.pl.phase_portraits(rpe1_kinetics, genes=genes, color='cell_cycle_phase', basis='RFP_GFP', vkey='velocity_S', -# # # ekey='M_s', show_arrowed_spines=False, show_quiver=True, quiver_size=5, -# # # figsize=(6 * 0.53, 4 * 0.53)) -# # # genes = ['DCBLD2'] -# # # dyn.pl.phase_portraits(rpe1_kinetics, genes=genes, color='cell_cycle_phase', basis='RFP_GFP', vkey='velocity_S', -# # # ekey='M_s', show_arrowed_spines=False, show_quiver=True, quiver_size=5, -# # # figsize=(6 * 0.53, 4 * 0.53)) -# # # genes = ['HIPK2'] -# # # dyn.pl.phase_portraits(rpe1_kinetics, genes=genes, color='cell_cycle_phase', basis='RFP_GFP', vkey='velocity_S', -# # # ekey='M_s', show_arrowed_spines=False, show_quiver=True, quiver_size=5, -# # # figsize=(6 * 0.53, 4 * 0.53))
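
Note: below is a minimal sketch of the two-step (`twostep`) estimation described in the KineticEstimation
constructor docstring above — per labeling time point, estimate the labeled fraction K from new and total RNA,
then regress -log(1 - K) against the labeling time t to obtain the degradation rate gamma. It is illustrative
only: the function name, the per-time-point slope estimator, and the simulated data are assumptions, not
dynamo's actual fit_twostep_kinetics implementation.

# Illustrative sketch only (assumed names; not dynamo's fit_twostep_kinetics).
import numpy as np


def twostep_gamma_sketch(new, total, t):
    """Estimate a per-gene degradation rate gamma from metabolic labeling data.

    new, total: 1-D arrays of new and total RNA counts for one gene across cells.
    t: 1-D array with the labeling time of each cell.
    """
    times = np.unique(t)
    K = np.zeros(len(times))
    for i, ti in enumerate(times):
        mask = t == ti
        # Step 1: per time point, K is the slope (through the origin) of new ~ total,
        # i.e. the fraction of labeled RNA at that labeling time.
        K[i] = np.sum(new[mask] * total[mask]) / np.sum(total[mask] ** 2)
    K = np.clip(K, 0, 1 - 1e-3)
    # Step 2: K = 1 - exp(-gamma * t)  =>  -log(1 - K) = gamma * t, so gamma is the
    # slope of -log(1 - K) regressed against time (through the origin).
    y = -np.log(1 - K)
    return np.sum(times * y) / np.sum(times ** 2)


# Toy usage on simulated data with gamma_true = 0.3 per hour.
rng = np.random.default_rng(0)
t = np.repeat([0.5, 1.0, 2.0], 200)
total = rng.poisson(50, size=t.size).astype(float)
new = total * (1 - np.exp(-0.3 * t)) + rng.normal(0, 0.5, size=t.size)
print(twostep_gamma_sketch(new, total, t))  # close to 0.3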