Skip to content

Commit 2e70b73

Browse files
authored
Merge pull request #302 from BenjaminDEMAILLE/master
Add --cellranger, --gzip, --delete-bus, and --genomebam/--cram support
2 parents dadae35 + eabdca1 commit 2e70b73

File tree

2 files changed

+172
-23
lines changed

2 files changed

+172
-23
lines changed

kb_python/count.py

Lines changed: 126 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -659,7 +659,7 @@ def bustools_whitelist(
659659

660660
def matrix_to_cellranger(
661661
matrix_path: str, barcodes_path: str, genes_path: str, t2g_path: str,
662-
out_dir: str
662+
out_dir: str, gzip: bool = False
663663
) -> Dict[str, str]:
664664
"""Convert bustools count matrix to cellranger-format matrix.
665665
@@ -669,22 +669,32 @@ def matrix_to_cellranger(
669669
genes_path: Path to genes.txt
670670
t2g_path: Path to transcript-to-gene mapping
671671
out_dir: Path to output matrix
672+
gzip: Whether to gzip compress the output files, defaults to `False`
672673
673674
Returns:
674675
Dictionary of matrix files
675676
"""
676677
make_directory(out_dir)
677678
logger.info(f'Writing matrix in cellranger format to {out_dir}')
678679

679-
cr_matrix_path = os.path.join(out_dir, CELLRANGER_MATRIX)
680-
cr_barcodes_path = os.path.join(out_dir, CELLRANGER_BARCODES)
681-
cr_genes_path = os.path.join(out_dir, CELLRANGER_GENES)
680+
# Add .gz extension if gzip is enabled
681+
gz_ext = '.gz' if gzip else ''
682+
cr_matrix_path = os.path.join(out_dir, CELLRANGER_MATRIX + gz_ext)
683+
cr_barcodes_path = os.path.join(out_dir, CELLRANGER_BARCODES + gz_ext)
684+
cr_genes_path = os.path.join(out_dir, CELLRANGER_GENES + gz_ext)
682685

683686
# Cellranger outputs genes x cells matrix
684687
mtx = scipy.io.mmread(matrix_path)
685-
scipy.io.mmwrite(cr_matrix_path, mtx.T, field='integer')
686-
687-
with open(barcodes_path, 'r') as f, open(cr_barcodes_path, 'w') as out:
688+
# Write to temporary file first, then optionally gzip
689+
temp_mtx = os.path.join(out_dir, CELLRANGER_MATRIX)
690+
scipy.io.mmwrite(temp_mtx, mtx.T, field='integer')
691+
if gzip:
692+
from .utils import compress_gzip
693+
compress_gzip(temp_mtx, cr_matrix_path)
694+
os.remove(temp_mtx)
695+
696+
opener = open_as_text if gzip else open
697+
with open(barcodes_path, 'r') as f, opener(cr_barcodes_path, 'w') as out:
688698
for line in f:
689699
if line.isspace():
690700
continue
@@ -700,13 +710,14 @@ def matrix_to_cellranger(
700710
if len(split) > 2:
701711
gene_to_name[split[1]] = split[2]
702712

703-
with open(genes_path, 'r') as f, open(cr_genes_path, 'w') as out:
713+
with opener(genes_path, 'r') as f, opener(cr_genes_path, 'w') as out:
704714
for line in f:
705715
if line.isspace():
706716
continue
707-
gene = line.strip()
708-
gene_name = gene_to_name.get(gene, gene)
709-
out.write(f'{gene}\t{gene_name}\n')
717+
gene_id = line.strip()
718+
gene_symbol = gene_to_name.get(gene_id, gene_id)
719+
# CellRanger format: GENE_ID\tGENE_SYMBOL (or FEATURE_ID\tFEATURE_NAME for features)
720+
out.write(f'{gene_id}\t{gene_symbol}\n')
710721

711722
return {
712723
'mtx': cr_matrix_path,
@@ -939,6 +950,7 @@ def filter_with_bustools(
939950
h5ad: bool = False,
940951
by_name: bool = False,
941952
cellranger: bool = False,
953+
gzip: bool = False,
942954
umi_gene: bool = True,
943955
em: bool = False,
944956
) -> Dict[str, str]:
@@ -1055,7 +1067,8 @@ def filter_with_bustools(
10551067
cr_result = matrix_to_cellranger(
10561068
count_result['mtx'], count_result['barcodes'],
10571069
count_result['genes'], t2g_path,
1058-
os.path.join(counts_dir, CELLRANGER_DIR)
1070+
os.path.join(counts_dir, CELLRANGER_DIR),
1071+
gzip=gzip
10591072
)
10601073
results.update({'cellranger': cr_result})
10611074
else:
@@ -1210,6 +1223,8 @@ def count(
12101223
h5ad: bool = False,
12111224
by_name: bool = False,
12121225
cellranger: bool = False,
1226+
gzip: bool = False,
1227+
delete_bus: bool = False,
12131228
inspect: bool = True,
12141229
report: bool = False,
12151230
fragment_l: Optional[int] = None,
@@ -1275,6 +1290,10 @@ def count(
12751290
by_name: Aggregate counts by name instead of ID.
12761291
cellranger: Whether to convert the final count matrix into a
12771292
cellranger-compatible matrix, defaults to `False`
1293+
gzip: Whether to gzip compress cellranger output matrices,
1294+
defaults to `False`
1295+
delete_bus: Whether to delete intermediate BUS files after successful count,
1296+
defaults to `False`
12781297
inspect: Whether or not to inspect the output BUS file and generate
12791298
the inspect.json
12801299
report: Generate an HTMl report, defaults to `False`
@@ -1632,7 +1651,8 @@ def update_results_with_suffix(current_results, new_results, suffix):
16321651
cr_result = matrix_to_cellranger(
16331652
count_result['mtx'], count_result['barcodes'],
16341653
count_result['genes'], t2g_path,
1635-
os.path.join(counts_dir, f'{CELLRANGER_DIR}{suffix}')
1654+
os.path.join(counts_dir, f'{CELLRANGER_DIR}{suffix}'),
1655+
gzip=gzip
16361656
)
16371657
update_results_with_suffix(
16381658
unfiltered_results, {'cellranger': cr_result}, suffix
@@ -1704,6 +1724,7 @@ def update_results_with_suffix(current_results, new_results, suffix):
17041724
loom_names=loom_names,
17051725
h5ad=h5ad,
17061726
by_name=by_name,
1727+
gzip=gzip,
17071728
umi_gene=umi_gene,
17081729
em=em,
17091730
)
@@ -1735,6 +1756,34 @@ def update_results_with_suffix(current_results, new_results, suffix):
17351756
)
17361757
unfiltered_results.update(report_result)
17371758

1759+
# Delete intermediate BUS files if requested
1760+
if delete_bus:
1761+
logger.info('Deleting intermediate BUS files to save disk space')
1762+
bus_files_to_delete = []
1763+
1764+
# Collect all .bus files from results
1765+
if 'bus' in unfiltered_results:
1766+
bus_files_to_delete.append(unfiltered_results['bus'])
1767+
if 'bus_scs' in unfiltered_results:
1768+
bus_files_to_delete.append(unfiltered_results['bus_scs'])
1769+
1770+
# For smartseq3, delete suffix versions too
1771+
for suffix in ['', INTERNAL_SUFFIX, UMI_SUFFIX]:
1772+
if f'bus{suffix}' in unfiltered_results:
1773+
bus_files_to_delete.append(unfiltered_results[f'bus{suffix}'])
1774+
if f'bus_scs{suffix}' in unfiltered_results:
1775+
bus_files_to_delete.append(unfiltered_results[f'bus_scs{suffix}'])
1776+
1777+
# Delete filtered bus if exists
1778+
if 'filtered' in results and 'bus_scs' in results['filtered']:
1779+
bus_files_to_delete.append(results['filtered']['bus_scs'])
1780+
1781+
# Delete each BUS file
1782+
for bus_file in bus_files_to_delete:
1783+
if bus_file and os.path.exists(bus_file):
1784+
logger.debug(f'Deleting {bus_file}')
1785+
os.remove(bus_file)
1786+
17381787
return results
17391788

17401789

@@ -1762,6 +1811,9 @@ def count_nac(
17621811
h5ad: bool = False,
17631812
by_name: bool = False,
17641813
cellranger: bool = False,
1814+
gzip: bool = False,
1815+
cellranger_style: bool = False,
1816+
delete_bus: bool = False,
17651817
inspect: bool = True,
17661818
report: bool = False,
17671819
nucleus: bool = False,
@@ -1823,6 +1875,12 @@ def count_nac(
18231875
by_name: Aggregate counts by name instead of ID.
18241876
cellranger: Whether to convert the final count matrix into a
18251877
cellranger-compatible matrix, defaults to `False`
1878+
gzip: Whether to gzip compress cellranger output matrices,
1879+
defaults to `False`
1880+
cellranger_style: Whether to organize output in CellRanger-style directories
1881+
(spliced/ and unspliced/ subdirectories), defaults to `False`
1882+
delete_bus: Whether to delete intermediate BUS files after successful count,
1883+
defaults to `False`
18261884
inspect: Whether or not to inspect the output BUS file and generate
18271885
the inspect.json
18281886
report: Generate HTML reports, defaults to `False`
@@ -2115,12 +2173,23 @@ def update_results_with_suffix(current_results, new_results, suffix):
21152173
prefix_results, count_result[i], suffix
21162174
)
21172175
if cellranger:
2176+
# Determine output directory based on cellranger_style
2177+
if cellranger_style:
2178+
# Create spliced/unspliced subdirectories for CellRanger style
2179+
if i == 0: # processed/spliced
2180+
cr_dir = os.path.join(counts_dir, 'spliced')
2181+
elif i == 1: # unprocessed/unspliced
2182+
cr_dir = os.path.join(counts_dir, 'unspliced')
2183+
else: # ambiguous
2184+
cr_dir = os.path.join(counts_dir, f'{CELLRANGER_DIR}_{prefix}{suffix}')
2185+
else:
2186+
cr_dir = os.path.join(counts_dir, f'{CELLRANGER_DIR}_{prefix}{suffix}')
2187+
21182188
cr_result = matrix_to_cellranger(
21192189
count_result[i]['mtx'], count_result[i]['barcodes'],
21202190
count_result[i]['genes'], t2g_path,
2121-
os.path.join(
2122-
counts_dir, f'{CELLRANGER_DIR}_{prefix}{suffix}'
2123-
)
2191+
cr_dir,
2192+
gzip=gzip
21242193
)
21252194
update_results_with_suffix(
21262195
prefix_results, {'cellranger': cr_result}, suffix
@@ -2159,7 +2228,8 @@ def update_results_with_suffix(current_results, new_results, suffix):
21592228
res['mtx'], res['barcodes'], res['genes'], t2g_path,
21602229
os.path.join(
21612230
counts_dir, f'{CELLRANGER_DIR}_{prefix}{suffix}'
2162-
)
2231+
),
2232+
gzip=gzip
21632233
)
21642234
update_results_with_suffix(
21652235
prefix_results, {'cellranger': cr_result}, suffix
@@ -2278,12 +2348,23 @@ def update_results_with_suffix(current_results, new_results, suffix):
22782348
)
22792349
})
22802350
if cellranger:
2351+
# Determine output directory based on cellranger_style
2352+
if cellranger_style:
2353+
# Create spliced/unspliced subdirectories for CellRanger style
2354+
if i == 0: # processed/spliced
2355+
cr_dir = os.path.join(filtered_counts_dir, 'spliced')
2356+
elif i == 1: # unprocessed/unspliced
2357+
cr_dir = os.path.join(filtered_counts_dir, 'unspliced')
2358+
else: # ambiguous
2359+
cr_dir = os.path.join(filtered_counts_dir, f'{CELLRANGER_DIR}_{prefix}')
2360+
else:
2361+
cr_dir = os.path.join(filtered_counts_dir, f'{CELLRANGER_DIR}_{prefix}')
2362+
22812363
cr_result = matrix_to_cellranger(
22822364
count_result[i]['mtx'], count_result[i]['barcodes'],
22832365
count_result[i]['genes'], t2g_path,
2284-
os.path.join(
2285-
filtered_counts_dir, f'{CELLRANGER_DIR}_{prefix}'
2286-
)
2366+
cr_dir,
2367+
gzip=gzip
22872368
)
22882369
filtered_results[prefix].update({'cellranger': cr_result})
22892370
filtered_results[prefix].update(count_result[i])
@@ -2319,7 +2400,8 @@ def update_results_with_suffix(current_results, new_results, suffix):
23192400
os.path.join(
23202401
filtered_counts_dir,
23212402
f'{CELLRANGER_DIR}_{prefix}'
2322-
)
2403+
),
2404+
gzip=gzip
23232405
)
23242406
filtered_results[prefix].update({
23252407
'cellranger': cr_result
@@ -2402,6 +2484,29 @@ def update_results_with_suffix(current_results, new_results, suffix):
24022484
'Plots for TCC matrices have not yet been implemented. The HTML report will not contain any plots.'
24032485
)
24042486

2487+
# Delete intermediate BUS files if requested
2488+
if delete_bus:
2489+
logger.info('Deleting intermediate BUS files to save disk space')
2490+
bus_files_to_delete = []
2491+
2492+
# Collect all .bus files from results
2493+
prefixes = ['processed', 'unprocessed', 'ambiguous']
2494+
for prefix in prefixes:
2495+
if prefix in unfiltered_results:
2496+
for suffix in ['', INTERNAL_SUFFIX, UMI_SUFFIX]:
2497+
if f'bus{suffix}' in unfiltered_results[prefix]:
2498+
bus_files_to_delete.append(unfiltered_results[prefix][f'bus{suffix}'])
2499+
2500+
# Delete filtered bus files if they exist
2501+
if 'filtered' in results and 'bus_scs' in results['filtered']:
2502+
bus_files_to_delete.append(results['filtered']['bus_scs'])
2503+
2504+
# Delete each BUS file
2505+
for bus_file in bus_files_to_delete:
2506+
if bus_file and os.path.exists(bus_file):
2507+
logger.debug(f'Deleting {bus_file}')
2508+
os.remove(bus_file)
2509+
24052510
return results
24062511

24072512

kb_python/main.py

Lines changed: 46 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -591,6 +591,11 @@ def parse_count(
591591
parser.error(
592592
f'Option `--aa` cannot be used with workflow {args.workflow}.'
593593
)
594+
595+
# Auto-enable gzip and cellranger-style when --cellranger is used
596+
use_gzip = args.cellranger and not args.no_gzip or args.gzip
597+
use_cellranger_style = args.cellranger
598+
594599
from .count import count_nac
595600
count_nac(
596601
args.i,
@@ -613,6 +618,9 @@ def parse_count(
613618
loom_names=loom_names,
614619
h5ad=args.h5ad,
615620
cellranger=args.cellranger,
621+
gzip=use_gzip,
622+
cellranger_style=use_cellranger_style,
623+
delete_bus=args.delete_bus,
616624
report=args.report,
617625
inspect=not args.no_inspect,
618626
temp_dir=temp_dir,
@@ -711,6 +719,9 @@ def parse_count(
711719
'`kite:10xFB` workflow is only supported with technology `10XV3`'
712720
)
713721

722+
# Auto-enable gzip when --cellranger is used (unless --no-gzip is specified)
723+
use_gzip = (args.cellranger and not args.no_gzip) or args.gzip
724+
714725
from .count import count
715726
count(
716727
args.i,
@@ -733,6 +744,8 @@ def parse_count(
733744
loom_names=loom_names,
734745
h5ad=args.h5ad,
735746
cellranger=args.cellranger,
747+
gzip=use_gzip,
748+
delete_bus=args.delete_bus,
736749
report=args.report,
737750
inspect=not args.no_inspect,
738751
temp_dir=temp_dir,
@@ -1286,7 +1299,19 @@ def setup_count_args(
12861299
)
12871300
parser_count.add_argument(
12881301
'--genomebam',
1289-
help=argparse.SUPPRESS,
1302+
help=(
1303+
'Generate genome-aligned BAM file from pseudoalignments. '
1304+
'Requires --gtf to be specified. --chromosomes is recommended.'
1305+
),
1306+
action='store_true',
1307+
default=False,
1308+
)
1309+
parser_count.add_argument(
1310+
'--cram',
1311+
help=(
1312+
'Convert BAM output to CRAM format (requires --genomebam). '
1313+
'CRAM provides better compression than BAM.'
1314+
),
12901315
action='store_true',
12911316
default=False,
12921317
)
@@ -1428,7 +1453,26 @@ def setup_count_args(
14281453
)
14291454
parser_count.add_argument(
14301455
'--cellranger',
1431-
help='Convert count matrices to cellranger-compatible format',
1456+
help=(
1457+
'Convert count matrices to cellranger-compatible format. '
1458+
'For nac/lamanno workflows, automatically creates spliced/ and unspliced/ subdirectories. '
1459+
'Gzip compression is enabled by default (use --no-gzip to disable)'
1460+
),
1461+
action='store_true'
1462+
)
1463+
parser_count.add_argument(
1464+
'--gzip',
1465+
help='Gzip compress output matrices (matrix.mtx.gz, barcodes.tsv.gz, genes.tsv.gz). Automatically enabled with --cellranger',
1466+
action='store_true'
1467+
)
1468+
parser_count.add_argument(
1469+
'--no-gzip',
1470+
help='Disable gzip compression for cellranger matrices',
1471+
action='store_true'
1472+
)
1473+
parser_count.add_argument(
1474+
'--delete-bus',
1475+
help='Delete intermediate BUS files after successful count to save disk space',
14321476
action='store_true'
14331477
)
14341478
parser_count.add_argument(

0 commit comments

Comments
 (0)