Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions EAID_000083/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
data
plots
figures
out
misc_out
35 changes: 35 additions & 0 deletions EAID_000083/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# ENCODE Manual Analyses: Pancreas RNA & Multiome Datasets

Analyses for integrating ENCODE scRNA and 10x Multiome datasets

Contact information:
Ben Parks
[email protected]

## Running the analyses

All analysis steps from data download to processed outputs are summarized in the `run_all.sh` script.

Note that this script may take very long to run end-to-end, and assume high memory and CPU count availability.
It is recommended to run the steps one at a time in a supervized manner.

Standardized outputs are saved in the `out` folder.
Figures are saved in the `figures` folder.
Intermediate data objects are stored in the `data` folder.

## Analysis Steps
- Cells are filtered based on fragment count, TSS enrichment, and/or UMI count (parameters in config/sample_cutoffs.tsv)
- Run sctransform followed by Seurat anchor integration for the RNA (code in `analysis/01b_integrate_samples.R`)
- Use a combination of Seurat alignment with outside datasets (Tabula Sapiens, Azimuth) and
marker gene analysis (via Seurat AddModuleScore) to attach cell types to the unbiased clusters

## Figure descriptions
- `Azimuth_clusters.pdf` Heatmap of jaccard overlap for cell types annotated by Azimuth alinment vs.
unbiased cluster IDs from initial integration
- `Azimuth_modules.pdf` Dotplot of module scores for Azimuth marker genes vs. unbiased cluter IDs from
initial integration
- `Hubmap_markers.pdf` Dotplot of gene expression for selected markers from the Hubmap ASCT+B tables vs.
unbiased cluster IDs from initial integration
- `TS_cluters.pdf` Same as `Azimuth_clusters.pdf`, but with alignment to Tabula Sapiens rather than Azimuth
- `umap_annotated_clusts.png` UMAP plot of the integrated datasets (based on RNA), labeled by annotated cell type
- `umap_automated_clusts.png` UMAP plot of the integrated datasets (based on RNA), labeled by automated cluter ID
137 changes: 137 additions & 0 deletions EAID_000083/analysis/00a_export_qc_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
suppressPackageStartupMessages({
library(tidyverse)
library(data.table)
library(dtplyr)
library(parallel)
library(Matrix)
})
source("analysis/utils.R")

meta <- read_tsv("config/panc_samples.tsv", col_types="cccccc")
cutoffs <-read_tsv("config/sample_cutoffs.tsv")

panc_all <- meta$id
panc_multiome <- meta %>% filter(multiome_series != "n/a") %>% pull(id)

gencode <- read_gtf("data/gencode.v29.annotation.gtf.gz", attributes=c("gene_type", "gene_id", "gene_name"), skip="chr1") %>%
as_tibble() %>%
filter(feature == "gene")
mito_genes <- gencode %>% filter(seqnames == "chrM") %>% pull(gene_id)
ribo_genes <- gencode %>% filter(gene_type == "rRNA") %>% pull(gene_id)

# rna_dataset: ENCODE scRNA-Seq dataset ID
# rna_barcode: scRNA-Seq barcode
# rna_frac_mito: Fraction of mitochondrial RNA reads
# rna_frac_ribo: Fraction of ribosomal RNA reads
# rna_umi_count: scRNA UMI’s per cell
rna_qc <- mclapply(panc_all, mc.cores=6, function(s) {
mat <- readRDS(sprintf("data/seurat/%s/rna_raw.rds", s))
tibble(
sample_name = s,
rna_barcode = colnames(mat),
cell_id = sprintf("%s#%s", sample_name, rna_barcode),
rna_umi_count = colSums(mat),
rna_frac_mito = colSums(mat[mito_genes,]) / rna_umi_count,
rna_frac_ribo = colSums(mat[ribo_genes,]) / rna_umi_count
)
}) %>% bind_rows()

# rna_dataset: ENCODE scRNA-Seq dataset ID
rna_qc <- rna_qc %>%
inner_join(
select(meta, rna_dataset=rna_id, sample_name=id, multiome_series),
by = "sample_name"
) %>%
mutate(multiome_series=na_if(multiome_series, "n/a"))

# atac_fragment_count: snATAC-Seq fragments per cell
# atac_tss_enrichment: snATAC-Seq transcription start site enrichment
atac_qc <- mclapply(panc_all, mc.cores=6, function(s) {
atac_qc <- readRDS(sprintf("data/archr/%1$s/ArchR_Project_unfiltered/QualityControl/%1$s/%1$s-Pre-Filter-Metadata.rds", s))
tibble(
sample_name = s,
cell_id = atac_qc$cellNames,
atac_fragment_count = atac_qc$nFrags,
atac_tss_enrichment = atac_qc$TSSEnrichment
)
}) %>% bind_rows()


reverse_complement <- function(x) {
stringi::stri_reverse(x) %>%
toupper() %>%
gsub("A", "t", .) %>%
gsub("T", "a", .) %>%
gsub("G", "c", .) %>%
gsub("C", "g", .) %>%
toupper()
}

# atac_barcode: snATAC-Seq barcode
atac_qc <- atac_qc %>%
mutate(rna = str_remove(cell_id, "^[^#]+#")) %>%
left_join(read_tsv("config/10x_barcode_translation.txt.gz", col_types="cc"), by="rna") %>%
mutate(atac_barcode = if_else(is.na(atac), rna, reverse_complement(atac))) %>%
select(!c(rna, atac))
# atac_dataset: ENCODE snATAC-Seq dataset ID
atac_qc <- atac_qc %>%
inner_join(
select(meta, atac_dataset=atac_id, sample_name=id, multiome_series),
by = "sample_name"
) %>%
mutate(multiome_series=na_if(multiome_series, "n/a"))

joint_qc <- mclapply(panc_all, mc.cores=6, function(s) {
m <- meta %>% filter(id == s)
rna_qc <- filter(rna_qc, rna_dataset == m$rna_id)
atac_qc <- filter(atac_qc, atac_dataset == m$atac_id)
if (m$multiome_series == "n/a") {
bind_rows(rna_qc, atac_qc)
} else {
rna_qc %>%
lazy_dt() %>%
select(!multiome_series) %>%
full_join(atac_qc, by=c("sample_name", "cell_id")) %>%
select(!multiome_series) %>%
mutate(
across(c(rna_umi_count, rna_frac_mito, rna_frac_ribo, atac_fragment_count, atac_tss_enrichment),
~ replace_na(.x, 0))
) %>%
mutate(
rna_dataset = m$rna_id,
atac_dataset = m$atac_id
) %>%
collect()
}
}) %>% bind_rows()


final_qc <- joint_qc %>%
inner_join(cutoffs, by=c("sample_name"="sample_id")) %>%
mutate(
passed_atac = atac_fragment_count >= min_nFrags & atac_tss_enrichment >= 5,
passed_rna = rna_umi_count >= min_umis,
passed_filtering = (passed_atac | is.na(atac_dataset)) & (passed_rna | is.na(rna_dataset))
) %>%
select(cell_id, sample_name, rna_dataset, rna_barcode, atac_dataset, atac_barcode,
rna_umi_count, atac_fragment_count,
rna_frac_mito, rna_frac_ribo, atac_tss_enrichment,
passed_filtering)

header_text <- c(
"# cell_id: Cell ID used in integrated analysis",
"# rna_dataset: ENCODE snRNA-Seq dataset ID",
"# rna_barcode: snRNA-Seq barcode",
"# atac_dataset: ENCODE snATAC-Seq dataset ID",
"# atac_barcode: snATAC-Seq barcode",
"# rna_umi_count: snRNA UMI's per cell",
"# atac_fragment_count: snATAC-Seq fragments per cell",
"# rna_frac_mito: Fraction of mitochondrial RNA reads",
"# rna_frac_ribo: Fraction of ribosomal RNA reads",
"# atac_tss_enrichment: snATAC-Seq transcription start site enrichment",
"# passed_filtering: A binary (0/1) value indicating whether the cell passed manual filtering"
)


write_lines(header_text, "out/metadata.tsv.gz")
write_tsv(final_qc, "out/metadata.tsv.gz", append=TRUE, col_names=TRUE)
72 changes: 72 additions & 0 deletions EAID_000083/analysis/00b_raw_rna_and_qc_plots.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
suppressPackageStartupMessages({
library(tidyverse)
library(Seurat)
library(SeuratDisk)
library(patchwork)
library(data.table)
library(GenomicRanges)
})
# This is adapted from 05_analysis/meetings/Mar30/PANC_analysis.R
# The aim is just to produce a list of Seurat objects with the filtered cells +
# raw counts matrices (subsetting to protein_coding, lincrna, and TR/IG genes)
source("analysis/utils.R")

meta <- read_tsv("config/panc_samples.tsv")
cutoffs <-read_tsv("config/sample_cutoffs.tsv")

panc_all <- meta %>% pull(id)
panc_multiome <- meta %>% filter(multiome_series != "n/a") %>% pull(id)


qc <- fread("out/metadata.tsv.gz", skip="cell_id\tsample_name")


tss_plots <- map(panc_all, function(s) {
c <- filter(cutoffs, sample_id == s)
cutoff_plot_hexbin(filter(qc, sample_name == s), log10(atac_fragment_count), atac_tss_enrichment, log10(c$min_nFrags), 5) +
theme_bw() + ggtitle(s) + ylim(0, 30)
})
tss_plot <- patchwork::wrap_plots(tss_plots, ncol=3)

rna_count_plots <- map(panc_all, function(s) {
c <- filter(cutoffs, sample_id == s)
read_count_cutoff_plot(filter(qc, sample_name == s) %>% pull(rna_umi_count), c$min_umis) +
theme_bw() + ggtitle(s)
})
rna_count_plot <- patchwork::wrap_plots(rna_count_plots, ncol=3)

gene_metadata <- read_tsv("data/ENCODE/samples/W61_PANC/rna/features.tsv.gz", col_names = c("id", "symbol", "type"))

gencode <- read_gtf("data/gencode.v29.annotation.gtf.gz", attributes=c("gene_type", "gene_id", "gene_name"), skip="chr1") %>%
as_tibble() %>%
filter(feature == "gene")

keeper_genes <- gencode %>%
filter(gene_type %in% c("protein_coding", "lincRNA") | str_detect(gene_type, "^IG_|TR_"))


rna_mats <- map(panc_all, function(s) {
c <- filter(cutoffs, sample_id == s)
passing_cells <- rna_qc %>%
filter(umis >= c$min_umis, sample_id == s) %>%
pull(cell_barcode)
passing_cells_atac <- atac_qc %>%
filter(TSSEnrichment >= 5, sample_id == s, nFrags >= c$min_nFrags) %>%
pull(cell_barcode)
if (filter(meta, id == s)[["multiome_series"]] != "n/a")
passing_cells <- intersect(passing_cells, passing_cells_atac)
mat <- readRDS(sprintf("data/seurat/%s/rna_raw.rds", s))[keeper_genes$gene_id,passing_cells]
rownames(mat) <- keeper_genes$gene_name
mat
})

seurat_raw <- mclapply(seq_along(rna_mats), mc.cores=4, function(i) {
mat <- rna_mats[[i]]
sample <- panc_all[i]
meta.data <- data.frame(sample=rep_len(sample, ncol(mat)))
rownames(meta.data) <- colnames(mat)
CreateSeuratObject(mat, project=sample, meta.data=meta.data)
})

saveRDS(seurat_raw, "data/seurat_raw.rds")

40 changes: 40 additions & 0 deletions EAID_000083/analysis/01a_integrate_tabula_sapiens.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
suppressPackageStartupMessages({
library(tidyverse)
library(Seurat)
library(SeuratDisk)
library(patchwork)
library(data.table)
library(GenomicRanges)
})
# Prevent BLAS ops from trying to oversubscribe cores
RhpcBLASctl::blas_set_num_threads(2)

seurat_vanilla <- readRDS("data/seurat_raw.rds")

ts_panc <-LoadH5Seurat(
"data/TS_Pancreas.h5seurat",
assays = c("RNA"),
misc = FALSE,
tools = FALSE
)
ts_panc <- ts_panc %>%
NormalizeData() %>%
FindVariableFeatures()

# About 80 seconds
system.time(
anchors <- mclapply(seurat_vanilla, mc.cores = 8, function(p) {
features <- SelectIntegrationFeatures(list(ts_panc, p))
anchors <- FindTransferAnchors(ts_panc, p, features = features)
anchors
})
)

cell_labels <- map(seq_along(anchors), function(i) tibble(
cell_label=TransferData(anchors[[i]], ts_panc$cell_ontology_class)$predicted.id,
sample = seurat_vanilla[[i]]$sample,
cell_id = colnames(seurat_vanilla[[i]])
)) %>%
bind_rows()

write_tsv(cell_labels, "data/tabula_sapiens_cell_labels.tsv")
Loading