@@ -659,7 +659,7 @@ def bustools_whitelist(
659659
660660def matrix_to_cellranger (
661661 matrix_path : str , barcodes_path : str , genes_path : str , t2g_path : str ,
662- out_dir : str
662+ out_dir : str , gzip : bool = False
663663) -> Dict [str , str ]:
664664 """Convert bustools count matrix to cellranger-format matrix.
665665
@@ -669,22 +669,32 @@ def matrix_to_cellranger(
669669 genes_path: Path to genes.txt
670670 t2g_path: Path to transcript-to-gene mapping
671671 out_dir: Path to output matrix
672+ gzip: Whether to gzip compress the output files, defaults to `False`
672673
673674 Returns:
674675 Dictionary of matrix files
675676 """
676677 make_directory (out_dir )
677678 logger .info (f'Writing matrix in cellranger format to { out_dir } ' )
678679
679- cr_matrix_path = os .path .join (out_dir , CELLRANGER_MATRIX )
680- cr_barcodes_path = os .path .join (out_dir , CELLRANGER_BARCODES )
681- cr_genes_path = os .path .join (out_dir , CELLRANGER_GENES )
680+ # Add .gz extension if gzip is enabled
681+ gz_ext = '.gz' if gzip else ''
682+ cr_matrix_path = os .path .join (out_dir , CELLRANGER_MATRIX + gz_ext )
683+ cr_barcodes_path = os .path .join (out_dir , CELLRANGER_BARCODES + gz_ext )
684+ cr_genes_path = os .path .join (out_dir , CELLRANGER_GENES + gz_ext )
682685
683686 # Cellranger outputs genes x cells matrix
684687 mtx = scipy .io .mmread (matrix_path )
685- scipy .io .mmwrite (cr_matrix_path , mtx .T , field = 'integer' )
686-
687- with open (barcodes_path , 'r' ) as f , open (cr_barcodes_path , 'w' ) as out :
688+ # Write to temporary file first, then optionally gzip
689+ temp_mtx = os .path .join (out_dir , CELLRANGER_MATRIX )
690+ scipy .io .mmwrite (temp_mtx , mtx .T , field = 'integer' )
691+ if gzip :
692+ from .utils import compress_gzip
693+ compress_gzip (temp_mtx , cr_matrix_path )
694+ os .remove (temp_mtx )
695+
696+ opener = open_as_text if gzip else open
697+ with open (barcodes_path , 'r' ) as f , opener (cr_barcodes_path , 'w' ) as out :
688698 for line in f :
689699 if line .isspace ():
690700 continue
@@ -700,13 +710,14 @@ def matrix_to_cellranger(
700710 if len (split ) > 2 :
701711 gene_to_name [split [1 ]] = split [2 ]
702712
703- with open (genes_path , 'r' ) as f , open (cr_genes_path , 'w' ) as out :
713+ with opener (genes_path , 'r' ) as f , opener (cr_genes_path , 'w' ) as out :
704714 for line in f :
705715 if line .isspace ():
706716 continue
707- gene = line .strip ()
708- gene_name = gene_to_name .get (gene , gene )
709- out .write (f'{ gene } \t { gene_name } \n ' )
717+ gene_id = line .strip ()
718+ gene_symbol = gene_to_name .get (gene_id , gene_id )
719+ # CellRanger format: GENE_ID\tGENE_SYMBOL (or FEATURE_ID\tFEATURE_NAME for features)
720+ out .write (f'{ gene_id } \t { gene_symbol } \n ' )
710721
711722 return {
712723 'mtx' : cr_matrix_path ,
@@ -939,6 +950,7 @@ def filter_with_bustools(
939950 h5ad : bool = False ,
940951 by_name : bool = False ,
941952 cellranger : bool = False ,
953+ gzip : bool = False ,
942954 umi_gene : bool = True ,
943955 em : bool = False ,
944956) -> Dict [str , str ]:
@@ -1055,7 +1067,8 @@ def filter_with_bustools(
10551067 cr_result = matrix_to_cellranger (
10561068 count_result ['mtx' ], count_result ['barcodes' ],
10571069 count_result ['genes' ], t2g_path ,
1058- os .path .join (counts_dir , CELLRANGER_DIR )
1070+ os .path .join (counts_dir , CELLRANGER_DIR ),
1071+ gzip = gzip
10591072 )
10601073 results .update ({'cellranger' : cr_result })
10611074 else :
@@ -1210,6 +1223,8 @@ def count(
12101223 h5ad : bool = False ,
12111224 by_name : bool = False ,
12121225 cellranger : bool = False ,
1226+ gzip : bool = False ,
1227+ delete_bus : bool = False ,
12131228 inspect : bool = True ,
12141229 report : bool = False ,
12151230 fragment_l : Optional [int ] = None ,
@@ -1275,6 +1290,10 @@ def count(
12751290 by_name: Aggregate counts by name instead of ID.
12761291 cellranger: Whether to convert the final count matrix into a
12771292 cellranger-compatible matrix, defaults to `False`
1293+ gzip: Whether to gzip compress cellranger output matrices,
1294+ defaults to `False`
1295+ delete_bus: Whether to delete intermediate BUS files after successful count,
1296+ defaults to `False`
12781297 inspect: Whether or not to inspect the output BUS file and generate
12791298 the inspect.json
12801299 report: Generate an HTMl report, defaults to `False`
@@ -1632,7 +1651,8 @@ def update_results_with_suffix(current_results, new_results, suffix):
16321651 cr_result = matrix_to_cellranger (
16331652 count_result ['mtx' ], count_result ['barcodes' ],
16341653 count_result ['genes' ], t2g_path ,
1635- os .path .join (counts_dir , f'{ CELLRANGER_DIR } { suffix } ' )
1654+ os .path .join (counts_dir , f'{ CELLRANGER_DIR } { suffix } ' ),
1655+ gzip = gzip
16361656 )
16371657 update_results_with_suffix (
16381658 unfiltered_results , {'cellranger' : cr_result }, suffix
@@ -1704,6 +1724,7 @@ def update_results_with_suffix(current_results, new_results, suffix):
17041724 loom_names = loom_names ,
17051725 h5ad = h5ad ,
17061726 by_name = by_name ,
1727+ gzip = gzip ,
17071728 umi_gene = umi_gene ,
17081729 em = em ,
17091730 )
@@ -1735,6 +1756,34 @@ def update_results_with_suffix(current_results, new_results, suffix):
17351756 )
17361757 unfiltered_results .update (report_result )
17371758
1759+ # Delete intermediate BUS files if requested
1760+ if delete_bus :
1761+ logger .info ('Deleting intermediate BUS files to save disk space' )
1762+ bus_files_to_delete = []
1763+
1764+ # Collect all .bus files from results
1765+ if 'bus' in unfiltered_results :
1766+ bus_files_to_delete .append (unfiltered_results ['bus' ])
1767+ if 'bus_scs' in unfiltered_results :
1768+ bus_files_to_delete .append (unfiltered_results ['bus_scs' ])
1769+
1770+ # For smartseq3, delete suffix versions too
1771+ for suffix in ['' , INTERNAL_SUFFIX , UMI_SUFFIX ]:
1772+ if f'bus{ suffix } ' in unfiltered_results :
1773+ bus_files_to_delete .append (unfiltered_results [f'bus{ suffix } ' ])
1774+ if f'bus_scs{ suffix } ' in unfiltered_results :
1775+ bus_files_to_delete .append (unfiltered_results [f'bus_scs{ suffix } ' ])
1776+
1777+ # Delete filtered bus if exists
1778+ if 'filtered' in results and 'bus_scs' in results ['filtered' ]:
1779+ bus_files_to_delete .append (results ['filtered' ]['bus_scs' ])
1780+
1781+ # Delete each BUS file
1782+ for bus_file in bus_files_to_delete :
1783+ if bus_file and os .path .exists (bus_file ):
1784+ logger .debug (f'Deleting { bus_file } ' )
1785+ os .remove (bus_file )
1786+
17381787 return results
17391788
17401789
@@ -1762,6 +1811,9 @@ def count_nac(
17621811 h5ad : bool = False ,
17631812 by_name : bool = False ,
17641813 cellranger : bool = False ,
1814+ gzip : bool = False ,
1815+ cellranger_style : bool = False ,
1816+ delete_bus : bool = False ,
17651817 inspect : bool = True ,
17661818 report : bool = False ,
17671819 nucleus : bool = False ,
@@ -1823,6 +1875,12 @@ def count_nac(
18231875 by_name: Aggregate counts by name instead of ID.
18241876 cellranger: Whether to convert the final count matrix into a
18251877 cellranger-compatible matrix, defaults to `False`
1878+ gzip: Whether to gzip compress cellranger output matrices,
1879+ defaults to `False`
1880+ cellranger_style: Whether to organize output in CellRanger-style directories
1881+ (spliced/ and unspliced/ subdirectories), defaults to `False`
1882+ delete_bus: Whether to delete intermediate BUS files after successful count,
1883+ defaults to `False`
18261884 inspect: Whether or not to inspect the output BUS file and generate
18271885 the inspect.json
18281886 report: Generate HTML reports, defaults to `False`
@@ -2115,12 +2173,23 @@ def update_results_with_suffix(current_results, new_results, suffix):
21152173 prefix_results , count_result [i ], suffix
21162174 )
21172175 if cellranger :
2176+ # Determine output directory based on cellranger_style
2177+ if cellranger_style :
2178+ # Create spliced/unspliced subdirectories for CellRanger style
2179+ if i == 0 : # processed/spliced
2180+ cr_dir = os .path .join (counts_dir , 'spliced' )
2181+ elif i == 1 : # unprocessed/unspliced
2182+ cr_dir = os .path .join (counts_dir , 'unspliced' )
2183+ else : # ambiguous
2184+ cr_dir = os .path .join (counts_dir , f'{ CELLRANGER_DIR } _{ prefix } { suffix } ' )
2185+ else :
2186+ cr_dir = os .path .join (counts_dir , f'{ CELLRANGER_DIR } _{ prefix } { suffix } ' )
2187+
21182188 cr_result = matrix_to_cellranger (
21192189 count_result [i ]['mtx' ], count_result [i ]['barcodes' ],
21202190 count_result [i ]['genes' ], t2g_path ,
2121- os .path .join (
2122- counts_dir , f'{ CELLRANGER_DIR } _{ prefix } { suffix } '
2123- )
2191+ cr_dir ,
2192+ gzip = gzip
21242193 )
21252194 update_results_with_suffix (
21262195 prefix_results , {'cellranger' : cr_result }, suffix
@@ -2159,7 +2228,8 @@ def update_results_with_suffix(current_results, new_results, suffix):
21592228 res ['mtx' ], res ['barcodes' ], res ['genes' ], t2g_path ,
21602229 os .path .join (
21612230 counts_dir , f'{ CELLRANGER_DIR } _{ prefix } { suffix } '
2162- )
2231+ ),
2232+ gzip = gzip
21632233 )
21642234 update_results_with_suffix (
21652235 prefix_results , {'cellranger' : cr_result }, suffix
@@ -2278,12 +2348,23 @@ def update_results_with_suffix(current_results, new_results, suffix):
22782348 )
22792349 })
22802350 if cellranger :
2351+ # Determine output directory based on cellranger_style
2352+ if cellranger_style :
2353+ # Create spliced/unspliced subdirectories for CellRanger style
2354+ if i == 0 : # processed/spliced
2355+ cr_dir = os .path .join (filtered_counts_dir , 'spliced' )
2356+ elif i == 1 : # unprocessed/unspliced
2357+ cr_dir = os .path .join (filtered_counts_dir , 'unspliced' )
2358+ else : # ambiguous
2359+ cr_dir = os .path .join (filtered_counts_dir , f'{ CELLRANGER_DIR } _{ prefix } ' )
2360+ else :
2361+ cr_dir = os .path .join (filtered_counts_dir , f'{ CELLRANGER_DIR } _{ prefix } ' )
2362+
22812363 cr_result = matrix_to_cellranger (
22822364 count_result [i ]['mtx' ], count_result [i ]['barcodes' ],
22832365 count_result [i ]['genes' ], t2g_path ,
2284- os .path .join (
2285- filtered_counts_dir , f'{ CELLRANGER_DIR } _{ prefix } '
2286- )
2366+ cr_dir ,
2367+ gzip = gzip
22872368 )
22882369 filtered_results [prefix ].update ({'cellranger' : cr_result })
22892370 filtered_results [prefix ].update (count_result [i ])
@@ -2319,7 +2400,8 @@ def update_results_with_suffix(current_results, new_results, suffix):
23192400 os .path .join (
23202401 filtered_counts_dir ,
23212402 f'{ CELLRANGER_DIR } _{ prefix } '
2322- )
2403+ ),
2404+ gzip = gzip
23232405 )
23242406 filtered_results [prefix ].update ({
23252407 'cellranger' : cr_result
@@ -2402,6 +2484,29 @@ def update_results_with_suffix(current_results, new_results, suffix):
24022484 'Plots for TCC matrices have not yet been implemented. The HTML report will not contain any plots.'
24032485 )
24042486
2487+ # Delete intermediate BUS files if requested
2488+ if delete_bus :
2489+ logger .info ('Deleting intermediate BUS files to save disk space' )
2490+ bus_files_to_delete = []
2491+
2492+ # Collect all .bus files from results
2493+ prefixes = ['processed' , 'unprocessed' , 'ambiguous' ]
2494+ for prefix in prefixes :
2495+ if prefix in unfiltered_results :
2496+ for suffix in ['' , INTERNAL_SUFFIX , UMI_SUFFIX ]:
2497+ if f'bus{ suffix } ' in unfiltered_results [prefix ]:
2498+ bus_files_to_delete .append (unfiltered_results [prefix ][f'bus{ suffix } ' ])
2499+
2500+ # Delete filtered bus files if they exist
2501+ if 'filtered' in results and 'bus_scs' in results ['filtered' ]:
2502+ bus_files_to_delete .append (results ['filtered' ]['bus_scs' ])
2503+
2504+ # Delete each BUS file
2505+ for bus_file in bus_files_to_delete :
2506+ if bus_file and os .path .exists (bus_file ):
2507+ logger .debug (f'Deleting { bus_file } ' )
2508+ os .remove (bus_file )
2509+
24052510 return results
24062511
24072512
0 commit comments