Single-cell resolution technologies warrant computational methods that capture cell heterogeneity while allowing efficient comparisons of populations. Here, we summarize cell populations by adding features’ measures of dispersion and covariances to population averages, in the context of morphological profiling. We find that data fusion is critical for these metrics to improve results over the prior state-of-the-art, providing ~30% better performance in tasks including predicting a compound’s mechanism of action (MoA) and a gene’s pathway.
Note:
Our paper says:
Extracted image-based features are publicly available in the following s3 bucket s3://cellpainting-datasets under folders corresponding to the respective names of the datasets.
We moved the files from s3://cellpainting-datasets to s3://cellpainting-gallery/cpg0015-heterogeneity/broad/workspace/supplementary/ in June 2022.
-
Mac OS X
-
R Ver. 3.3.3
-
Following R packages: dplyr 0.7.4 magrittr 1.5 foreach 1.4.4 stringr 1.2.0 readr 1.1.1 doParallel 1.0.11 SNFtool 2.2 ggplot2 2.2.1 Matrix 1.2-8 htmlTable 1.6 readbulk 1.1.0 cytominer 0.1.0.9000 (https://github.com/cytomining/cytominer)
-
aws command line interface (https://docs.aws.amazon.com/cli/latest/userguide/cli-install-macos.html) configured to access the
cellpainting-datasets s3 bucket.
-
Package installation time is about an hour on a typical PC.
-
Note : For each dataset, create a separate clone of the repository. Then,
cd code.
-
Note : TA-ORF-BBBC037-Rohban is the smallest dataset, consisting of only around 5 plates, so it can also be used for demo purposes. Each plate takes on average between 2 and 3 hours to process on a normal PC. Bioactives-BBBC022-Gustafsdottir and CDRPBIO-BBBC036-Bray consist of 20 and 55 plates, respectively.
- Bioactives-BBBC022-Gustafsdottir :
parallel -j 1 './profile_trad.R --name=Bioactives-BBBC022-Gustafsdottir --batch=BBBC022_2013 --plate={1} --operation="median+mad" --col="Metadata_broad_sample" --value="DMSO" --cores=2 --feats="../input/feature_list_BBBC022.txt"' :::: ../input/processed_plates_BBBC022.txt
- TA-ORF-BBBC037-Rohban :
parallel -j 1 './profile_trad.R --name=TA-ORF-BBBC037-Rohban --batch=SIGMA2_Pilot_2013_10_11 --plate={1} --operation="median+mad" --col="Metadata_ASSAY_WELL_ROLE" --value="Untreated" --cores=2 --feats="../input/feature_list.txt"' :::: ../input/processed_plates_TA.txt
- CDRPBIO-BBBC036-Bray :
parallel -j 1 './profile_trad.R --name=CDRPBIO-BBBC036-Bray --batch=CDRP --plate={1} --operation="median+mad" --col="Metadata_broad_sample" --value="DMSO" --cores=2 --feats="../input/feature_list.txt"' :::: ../input/processed_plates_CDRP_bio.txt
- Bioactives-BBBC022-Gustafsdottir :
rm ../input/random_projection_unified.rds
mv ../input/random_projection_unified_BBBC022.rds ../input/random_projection_unified.rds
parallel -j 1 './profile.R --name=Bioactives-BBBC022-Gustafsdottir --batch=BBBC022_2013 --plate={1} --dim=3000 --rdensity=0.1 --core=2 --col=Metadata_broad_sample --value="DMSO" --feats="../input/feature_list_BBBC022.txt"' :::: ../input/processed_plates_BBBC022.txt
- TA-ORF-BBBC037-Rohban :
parallel -j 1 './profile.R --name=TA-ORF-BBBC037-Rohban --batch=SIGMA2_Pilot_2013_10_11 --plate={1} --dim=3000 --rdensity=0.1 --core=2 --col=Metadata_ASSAY_WELL_ROLE --value="Untreated" --feats="../input/feature_list.txt"' :::: ../input/processed_plates_TA.txt
- CDRPBIO-BBBC036-Bray :
parallel -j 1 './profile.R --name=CDRPBIO-BBBC036-Bray --batch=CDRP --plate={1} --dim=3000 --rdensity=0.1 --core=2 --col=Metadata_broad_sample --value="DMSO" --feats="../input/feature_list.txt"' :::: ../input/processed_plates_CDRP_bio.txt
- Bioactives-BBBC022-Gustafsdottir :
./sample_dmso.R --name=Bioactives-BBBC022-Gustafsdottir --batch=BBBC022_2013 --plate="../input/processed_plates_BBBC022.txt" --feats="../input/feature_list_BBBC022.txt" --col="Metadata_broad_sample" --value="DMSO"
./profile_factor_analysis.R --name=Bioactives-BBBC022-Gustafsdottir --batch=BBBC022_2013 --plate="../input/processed_plates_BBBC022.txt" --feats="../input/feature_list_BBBC022.txt" --col="Metadata_broad_sample" --value="DMSO"
./evaluate_factor_analysis.R --name=Bioactives-BBBC022-Gustafsdottir --batch=BBBC022_2013 --plate="../input/processed_plates_BBBC022.txt" --feats="../input/feature_list_BBBC022.txt" --meta="../input/metadata_BBBC022.csv" --col="Metadata_broad_sample" --value="DMSO"
- TA-ORF-BBBC037-Rohban :
./sample_dmso.R --name=TA-ORF-BBBC037-Rohban --batch=SIGMA2_Pilot_2013_10_11 --plate="../input/processed_plates_TA.txt" --feats="../input/feature_list.txt" --col="Metadata_ASSAY_WELL_ROLE" --value="Untreated"
./profile_factor_analysis.R --name=TA-ORF-BBBC037-Rohban --batch=SIGMA2_Pilot_2013_10_11 --plate="../input/processed_plates_TA.txt" --feats="../input/feature_list.txt" --col="Metadata_ASSAY_WELL_ROLE" --value="Untreated"
./evaluate_factor_analysis.R --name=TA-ORF-BBBC037-Rohban --batch=SIGMA2_Pilot_2013_10_11 --plate="../input/processed_plates_TA.txt" --feats="../input/feature_list.txt" --meta="../input/metadata_TA.csv" --col="Metadata_ASSAY_WELL_ROLE" --value="Untreated"
- CDRPBIO-BBBC036-Bray :
./sample_dmso.R --name=CDRPBIO-BBBC036-Bray --batch=CDRP --plate="../input/processed_plates_CDRP_bio.txt" --feats="../input/feature_list.txt" --col="Metadata_broad_sample" --value="DMSO"
./profile_factor_analysis.R --name=CDRPBIO-BBBC036-Bray --batch=CDRP --plate="../input/processed_plates_CDRP_bio.txt" --feats="../input/feature_list.txt" --col="Metadata_broad_sample" --value="DMSO"
./evaluate_factor_analysis.R --name=CDRPBIO-BBBC036-Bray --batch=CDRP --plate="../input/processed_plates_CDRP_bio.txt" --feats="../input/feature_list.txt" --meta="../input/metadata_CDRP.csv" --col="Metadata_broad_sample" --value="DMSO"
- Bioactives-BBBC022-Gustafsdottir :
./profile_pca.R --name=Bioactives-BBBC022-Gustafsdottir --batch=BBBC022_2013 --plate="../input/processed_plates_BBBC022.txt" --feats="../input/feature_list_BBBC022.txt" --col="Metadata_broad_sample" --value="DMSO"
./evaluate_pca.R --name=Bioactives-BBBC022-Gustafsdottir --batch=BBBC022_2013 --plate="../input/processed_plates_BBBC022.txt" --feats="../input/feature_list_BBBC022.txt" --meta="../input/metadata_BBBC022.csv" --col="Metadata_broad_sample" --value="DMSO"
- TA-ORF-BBBC037-Rohban :
./profile_pca.R --name=TA-ORF-BBBC037-Rohban --batch=SIGMA2_Pilot_2013_10_11 --plate="../input/processed_plates_TA.txt" --feats="../input/feature_list.txt" --col="Metadata_ASSAY_WELL_ROLE" --value="Untreated"
./evaluate_pca.R --name=TA-ORF-BBBC037-Rohban --batch=SIGMA2_Pilot_2013_10_11 --plate="../input/processed_plates_TA.txt" --feats="../input/feature_list.txt" --meta="../input/metadata_TA.csv" --col="Metadata_ASSAY_WELL_ROLE" --value="Untreated"
- CDRPBIO-BBBC036-Bray :
./profile_pca.R --name=CDRPBIO-BBBC036-Bray --batch=CDRP --plate="../input/processed_plates_CDRP_bio.txt" --feats="../input/feature_list.txt" --col="Metadata_broad_sample" --value="DMSO"
./evaluate_pca.R --name=CDRPBIO-BBBC036-Bray --batch=CDRP --plate="../input/processed_plates_CDRP_bio.txt" --feats="../input/feature_list.txt" --meta="../input/metadata_CDRP.csv" --col="Metadata_broad_sample" --value="DMSO"
- Bioactives-BBBC022-Gustafsdottir :
./evaluate.R -m "median" -p "../input/processed_plates_BBBC022.txt" -e ../input/metadata_BBBC022.csv -f "../input/feature_list_BBBC022.txt"
./evaluate.R -m "mad" -p "../input/processed_plates_BBBC022.txt" -e ../input/metadata_BBBC022.csv -f "../input/feature_list_BBBC022.txt"
./evaluate.R -m "cov" -p "../input/processed_plates_BBBC022.txt" -e ../input/metadata_BBBC022.csv -f "../input/feature_list_BBBC022.txt"
./evaluate.R -m "median+mad" -p "../input/processed_plates_BBBC022.txt" -e ../input/metadata_BBBC022.csv -f "../input/feature_list_BBBC022.txt"
- TA-ORF-BBBC037-Rohban :
./evaluate.R -m "median" -p "../input/processed_plates_TA.txt" -e ../input/metadata_TA.csv -f "../input/feature_list.txt"
./evaluate.R -m "mad" -p "../input/processed_plates_TA.txt" -e ../input/metadata_TA.csv -f "../input/feature_list.txt"
./evaluate.R -m "cov" -p "../input/processed_plates_TA.txt" -e ../input/metadata_TA.csv -f "../input/feature_list.txt"
./evaluate.R -m "median+mad" -p "../input/processed_plates_TA.txt" -e ../input/metadata_TA.csv -f "../input/feature_list.txt"
- CDRPBIO-BBBC036-Bray :
./evaluate.R -m "median" -p "../input/processed_plates_CDRP_bio.txt" -e ../input/metadata_CDRP.csv -f "../input/feature_list.txt"
./evaluate.R -m "mad" -p "../input/processed_plates_CDRP_bio.txt" -e ../input/metadata_CDRP.csv -f "../input/feature_list.txt"
./evaluate.R -m "cov" -p "../input/processed_plates_CDRP_bio.txt" -e ../input/metadata_CDRP.csv -f "../input/feature_list.txt"
./evaluate.R -m "median+mad" -p "../input/processed_plates_CDRP_bio.txt" -e ../input/metadata_CDRP.csv -f "../input/feature_list.txt"
- Run
./compare_mean_cov.R -p chemical for Bioactives-BBBC022-Gustafsdottir and CDRPBIO-BBBC036-Bray
- Run
./compare_mean_cov.R -p genetic for TA-ORF-BBBC037-Rohban
- Run
sub_corr_plot.R for CDRPBIO-BBBC036-Bray
- Run
./compare_mean_cov_filteredMoA.R -p chemical for Bioactives-BBBC022-Gustafsdottir and CDRPBIO-BBBC036-Bray
- Run
./compare_mean_cov_filteredMoA.R -p genetic for TA-ORF-BBBC037-Rohban