ENCODE-AWG · bnprks · Oct 24, 2022 · Nov 21, 2022
diff --git a/EAID_000083/.gitignore b/EAID_000083/.gitignore
@@ -0,0 +1,5 @@
+data
+plots
+figures
+out
+misc_out
diff --git a/EAID_000083/README.md b/EAID_000083/README.md
@@ -0,0 +1,35 @@
+# ENCODE Manual Analyses: Pancreas RNA & Multiome Datasets
+
+Analyses for integrating ENCODE scRNA and 10x Multiome datasets
+
+Contact information: 
+Ben Parks 
+[email protected]
+
+## Running the analyses
+
+All analysis steps from data download to processed outputs are summarized in the `run_all.sh` script.
+
+Note that this script may take very long to run end-to-end, and assume high memory and CPU count availability.
+It is recommended to run the steps one at a time in a supervized manner.
+
+Standardized outputs are saved in the `out` folder.
+Figures are saved in the `figures` folder.
+Intermediate data objects are stored in the `data` folder.
+
+## Analysis Steps
+- Cells are filtered based on fragment count, TSS enrichment, and/or UMI count (parameters in config/sample_cutoffs.tsv)
+- Run sctransform followed by Seurat anchor integration for the RNA (code in `analysis/01b_integrate_samples.R`)
+- Use a combination of Seurat alignment with outside datasets (Tabula Sapiens, Azimuth) and
+  marker gene analysis (via Seurat AddModuleScore) to attach cell types to the unbiased clusters
+
+## Figure descriptions
+- `Azimuth_clusters.pdf` Heatmap of jaccard overlap for cell types annotated by Azimuth alinment vs.
+   unbiased cluster IDs from initial integration
+- `Azimuth_modules.pdf` Dotplot of module scores for Azimuth marker genes vs. unbiased cluter IDs from
+   initial integration
+- `Hubmap_markers.pdf` Dotplot of gene expression for selected markers from the Hubmap ASCT+B tables vs.
+   unbiased cluster IDs from initial integration
+- `TS_cluters.pdf` Same as `Azimuth_clusters.pdf`, but with alignment to Tabula Sapiens rather than Azimuth
+- `umap_annotated_clusts.png` UMAP plot of the integrated datasets (based on RNA), labeled by annotated cell type
+- `umap_automated_clusts.png` UMAP plot of the integrated datasets (based on RNA), labeled by automated cluter ID
diff --git a/EAID_000083/analysis/00a_export_qc_data.R b/EAID_000083/analysis/00a_export_qc_data.R
@@ -0,0 +1,137 @@
+suppressPackageStartupMessages({
+  library(tidyverse)
+  library(data.table)
+  library(dtplyr)
+  library(parallel)
+  library(Matrix)
+})
+source("analysis/utils.R")
+
+meta <- read_tsv("config/panc_samples.tsv", col_types="cccccc")
+cutoffs <-read_tsv("config/sample_cutoffs.tsv")
+
+panc_all <- meta$id
+panc_multiome <- meta %>% filter(multiome_series != "n/a") %>% pull(id)
+
+gencode <- read_gtf("data/gencode.v29.annotation.gtf.gz", attributes=c("gene_type", "gene_id", "gene_name"), skip="chr1") %>%
+  as_tibble() %>%
+  filter(feature == "gene")
+mito_genes <- gencode %>% filter(seqnames == "chrM") %>% pull(gene_id)
+ribo_genes <- gencode %>% filter(gene_type == "rRNA") %>% pull(gene_id)
+
+# rna_dataset: ENCODE scRNA-Seq dataset ID
+# rna_barcode: scRNA-Seq barcode
+# rna_frac_mito: Fraction of mitochondrial RNA reads
+# rna_frac_ribo: Fraction of ribosomal RNA reads
+# rna_umi_count: scRNA UMI’s per cell
+rna_qc <- mclapply(panc_all, mc.cores=6, function(s) {
+  mat <- readRDS(sprintf("data/seurat/%s/rna_raw.rds", s))
+  tibble(
+    sample_name = s,
+    rna_barcode = colnames(mat),
+    cell_id = sprintf("%s#%s", sample_name, rna_barcode),
+    rna_umi_count = colSums(mat),
+    rna_frac_mito = colSums(mat[mito_genes,]) / rna_umi_count,
+    rna_frac_ribo = colSums(mat[ribo_genes,]) / rna_umi_count
+  )
+}) %>% bind_rows()
+
+# rna_dataset: ENCODE scRNA-Seq dataset ID
+rna_qc <- rna_qc %>%
+  inner_join(
+    select(meta, rna_dataset=rna_id, sample_name=id, multiome_series),
+    by = "sample_name"
+  ) %>%
+  mutate(multiome_series=na_if(multiome_series, "n/a"))
+
+# atac_fragment_count: snATAC-Seq fragments per cell
+# atac_tss_enrichment: snATAC-Seq transcription start site enrichment
+atac_qc <- mclapply(panc_all, mc.cores=6, function(s) {
+  atac_qc <- readRDS(sprintf("data/archr/%1$s/ArchR_Project_unfiltered/QualityControl/%1$s/%1$s-Pre-Filter-Metadata.rds", s))
+  tibble(
+    sample_name = s,
+    cell_id = atac_qc$cellNames,
+    atac_fragment_count = atac_qc$nFrags,
+    atac_tss_enrichment = atac_qc$TSSEnrichment
+  )
+}) %>% bind_rows()
+
+
+reverse_complement <- function(x) {
+  stringi::stri_reverse(x) %>%
+    toupper() %>%
+    gsub("A", "t", .) %>%
+    gsub("T", "a", .) %>%
+    gsub("G", "c", .) %>%
+    gsub("C", "g", .) %>%
+    toupper()
+}
+
+# atac_barcode: snATAC-Seq barcode
+atac_qc <- atac_qc %>%
+  mutate(rna = str_remove(cell_id, "^[^#]+#")) %>%
+  left_join(read_tsv("config/10x_barcode_translation.txt.gz", col_types="cc"), by="rna") %>%
+  mutate(atac_barcode = if_else(is.na(atac), rna, reverse_complement(atac))) %>%
+  select(!c(rna, atac))
+# atac_dataset: ENCODE snATAC-Seq dataset ID
+atac_qc <- atac_qc %>%
+  inner_join(
+    select(meta, atac_dataset=atac_id, sample_name=id, multiome_series),
+    by = "sample_name"
+  ) %>%
+  mutate(multiome_series=na_if(multiome_series, "n/a"))
+
+joint_qc <- mclapply(panc_all, mc.cores=6, function(s) {
+  m <- meta %>% filter(id == s)
+  rna_qc <- filter(rna_qc, rna_dataset == m$rna_id)
+  atac_qc <- filter(atac_qc, atac_dataset == m$atac_id)
+  if (m$multiome_series == "n/a") {
+    bind_rows(rna_qc, atac_qc)
+  } else {
+    rna_qc %>%
+      lazy_dt() %>%
+      select(!multiome_series) %>%
+      full_join(atac_qc, by=c("sample_name", "cell_id")) %>%
+      select(!multiome_series) %>%
+      mutate(
+        across(c(rna_umi_count, rna_frac_mito, rna_frac_ribo, atac_fragment_count, atac_tss_enrichment),
+               ~ replace_na(.x, 0))
+      ) %>%
+      mutate(
+        rna_dataset = m$rna_id,
+        atac_dataset = m$atac_id
+      ) %>%
+      collect()
+  }
+}) %>% bind_rows()
+
+
+final_qc <- joint_qc %>%
+  inner_join(cutoffs, by=c("sample_name"="sample_id")) %>%
+  mutate(
+    passed_atac = atac_fragment_count >= min_nFrags & atac_tss_enrichment >= 5,
+    passed_rna = rna_umi_count >= min_umis,
+    passed_filtering = (passed_atac | is.na(atac_dataset)) & (passed_rna | is.na(rna_dataset))
+  ) %>% 
+  select(cell_id, sample_name, rna_dataset, rna_barcode, atac_dataset, atac_barcode, 
+         rna_umi_count, atac_fragment_count, 
+         rna_frac_mito, rna_frac_ribo, atac_tss_enrichment, 
+         passed_filtering)
+
+header_text <- c(
+  "# cell_id: Cell ID used in integrated analysis",
+  "# rna_dataset: ENCODE snRNA-Seq dataset ID",
+  "# rna_barcode: snRNA-Seq barcode",
+  "# atac_dataset: ENCODE snATAC-Seq dataset ID",
+  "# atac_barcode: snATAC-Seq barcode",
+  "# rna_umi_count: snRNA UMI's per cell",
+  "# atac_fragment_count: snATAC-Seq fragments per cell",
+  "# rna_frac_mito: Fraction of mitochondrial RNA reads",
+  "# rna_frac_ribo: Fraction of ribosomal RNA reads",
+  "# atac_tss_enrichment: snATAC-Seq transcription start site enrichment",
+  "# passed_filtering: A binary (0/1) value indicating whether the cell passed manual filtering"
+)
+
+
+write_lines(header_text, "out/metadata.tsv.gz")
+write_tsv(final_qc, "out/metadata.tsv.gz", append=TRUE, col_names=TRUE)
diff --git a/EAID_000083/analysis/00b_raw_rna_and_qc_plots.R b/EAID_000083/analysis/00b_raw_rna_and_qc_plots.R
@@ -0,0 +1,72 @@
+suppressPackageStartupMessages({
+  library(tidyverse)
+  library(Seurat)
+  library(SeuratDisk)
+  library(patchwork)
+  library(data.table)
+  library(GenomicRanges)
+})
+# This is adapted from 05_analysis/meetings/Mar30/PANC_analysis.R
+# The aim is just to produce a list of Seurat objects with the filtered cells + 
+# raw counts matrices (subsetting to protein_coding, lincrna, and TR/IG genes)
+source("analysis/utils.R")
+
+meta <- read_tsv("config/panc_samples.tsv")
+cutoffs <-read_tsv("config/sample_cutoffs.tsv")
+
+panc_all <- meta %>% pull(id)
+panc_multiome <- meta %>% filter(multiome_series != "n/a") %>% pull(id)
+
+
+qc <- fread("out/metadata.tsv.gz", skip="cell_id\tsample_name") 
+
+
+tss_plots <- map(panc_all, function(s) {
+  c <- filter(cutoffs, sample_id == s)
+  cutoff_plot_hexbin(filter(qc, sample_name == s), log10(atac_fragment_count), atac_tss_enrichment, log10(c$min_nFrags), 5) +
+    theme_bw() + ggtitle(s) + ylim(0, 30)
+})
+tss_plot <- patchwork::wrap_plots(tss_plots, ncol=3)
+
+rna_count_plots <- map(panc_all, function(s) {
+  c <- filter(cutoffs, sample_id == s)
+  read_count_cutoff_plot(filter(qc, sample_name == s) %>% pull(rna_umi_count), c$min_umis) +
+    theme_bw() + ggtitle(s)
+})
+rna_count_plot <- patchwork::wrap_plots(rna_count_plots, ncol=3)
+
+gene_metadata <- read_tsv("data/ENCODE/samples/W61_PANC/rna/features.tsv.gz", col_names = c("id", "symbol", "type"))
+
+gencode <- read_gtf("data/gencode.v29.annotation.gtf.gz", attributes=c("gene_type", "gene_id", "gene_name"), skip="chr1") %>%
+  as_tibble() %>%
+  filter(feature == "gene")
+
+keeper_genes <- gencode %>%
+  filter(gene_type %in% c("protein_coding", "lincRNA") | str_detect(gene_type, "^IG_|TR_"))
+
+
+rna_mats <- map(panc_all, function(s) {
+  c <- filter(cutoffs, sample_id == s)
+  passing_cells <- rna_qc %>%
+    filter(umis >= c$min_umis, sample_id == s) %>%
+    pull(cell_barcode)
+  passing_cells_atac <- atac_qc %>% 
+    filter(TSSEnrichment >= 5, sample_id == s, nFrags >= c$min_nFrags) %>%
+    pull(cell_barcode)
+  if (filter(meta, id == s)[["multiome_series"]] != "n/a")
+    passing_cells <- intersect(passing_cells, passing_cells_atac)
+  mat <- readRDS(sprintf("data/seurat/%s/rna_raw.rds", s))[keeper_genes$gene_id,passing_cells]
+  rownames(mat) <- keeper_genes$gene_name
+  mat
+})
+
+seurat_raw <- mclapply(seq_along(rna_mats), mc.cores=4, function(i) {
+  mat <- rna_mats[[i]]
+  sample <- panc_all[i]
+  meta.data <- data.frame(sample=rep_len(sample, ncol(mat)))
+  rownames(meta.data) <- colnames(mat)
+  CreateSeuratObject(mat, project=sample, meta.data=meta.data)
+})
+
+saveRDS(seurat_raw, "data/seurat_raw.rds")
+
diff --git a/EAID_000083/analysis/01a_integrate_tabula_sapiens.R b/EAID_000083/analysis/01a_integrate_tabula_sapiens.R
@@ -0,0 +1,40 @@
+suppressPackageStartupMessages({
+  library(tidyverse)
+  library(Seurat)
+  library(SeuratDisk)
+  library(patchwork)
+  library(data.table)
+  library(GenomicRanges)
+})
+# Prevent BLAS ops from trying to oversubscribe cores
+RhpcBLASctl::blas_set_num_threads(2)
+
+seurat_vanilla <- readRDS("data/seurat_raw.rds")
+
+ts_panc <-LoadH5Seurat(
+  "data/TS_Pancreas.h5seurat",
+  assays = c("RNA"),
+  misc = FALSE,
+  tools = FALSE
+)
+ts_panc <- ts_panc %>%
+  NormalizeData() %>%
+  FindVariableFeatures()
+
+# About 80 seconds
+system.time(
+  anchors <- mclapply(seurat_vanilla, mc.cores = 8, function(p) {
+    features <- SelectIntegrationFeatures(list(ts_panc, p))
+    anchors <- FindTransferAnchors(ts_panc, p, features = features)
+    anchors
+  })
+)
+
+cell_labels <- map(seq_along(anchors), function(i) tibble(
+    cell_label=TransferData(anchors[[i]], ts_panc$cell_ontology_class)$predicted.id,
+    sample = seurat_vanilla[[i]]$sample,
+    cell_id = colnames(seurat_vanilla[[i]])
+  )) %>%
+  bind_rows() 
+
+write_tsv(cell_labels, "data/tabula_sapiens_cell_labels.tsv")