10 changes: 8 additions & 2 deletions CHANGELOG.md
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added

- [#2087](https://github.com/nf-core/sarek/pull/2087) - Add `bam` as output format for parabricks/fq2bam, add multi lane support
- Add `xengsort` as an additional contamination removal option, including index build/reuse support and workflow integration

### Changed

@@ -36,8 +37,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Parameters

| Params | status |
| ------ | ------ |
| Params | status |
| ----------------------- | ------ |
| `--xengsort_host_fasta` | added |
| `--xengsort_index` | added |
| `--xengsort_nobjects` | added |
| `--xengsort_kmersize` | added |
| `--save_xengsort_reads` | added |

### Developer section

2 changes: 1 addition & 1 deletion README.md
@@ -44,7 +44,7 @@ Depending on the options and samples provided, the pipeline can currently perfor

- Form consensus reads from UMI sequences (`fgbio`)
- Sequencing quality control and trimming (enabled by `--trim_fastq`) (`FastQC`, `fastp`)
- Contamination removal (`BBSplit`, enabled by `--tools bbsplit`)
- Contamination removal (`BBSplit` or `xengsort`, enabled by `--tools bbsplit` or `--tools xengsort`)
- Map Reads to Reference (`BWA-mem`, `BWA-mem2`, `dragmap` or `Sentieon BWA-mem`)
- Process BAM file (`GATK MarkDuplicates`, `GATK BaseRecalibrator` and `GATK ApplyBQSR` or `Sentieon LocusCollector` and `Sentieon Dedup`)
- _Experimental Feature_: Use GPU-accelerated parabricks implementation as alternative to "Map Reads to Reference" + "Process BAM file" (`--aligner parabricks`)
21 changes: 21 additions & 0 deletions conf/modules/contamination.config
@@ -18,4 +18,25 @@ process {
]
]
}

withName: 'XENGSORT_CLASSIFY' {
// Include read basename when FASTQ is split to avoid filename collisions across chunks.
ext.prefix = {
if (params.split_fastq && reads) {
def first_read_name = reads instanceof List ? reads[0].getName() : reads.getName()
"${first_read_name.tokenize('.')[0]}.${meta.id}"
} else {
"${meta.id}"
}
}

publishDir = [
[
path: { params.save_xengsort_reads ? "${params.outdir}/preprocessing/xengsort/${meta.id}" : params.outdir },
mode: params.publish_dir_mode,
pattern: '*.fq.gz',
saveAs: { params.save_xengsort_reads ? it : null }
]
]
}
}
10 changes: 10 additions & 0 deletions conf/modules/prepare_genome.config
@@ -98,6 +98,16 @@ process {
]
}

withName: 'XENGSORT_INDEX' {
ext.nobjects = { params.xengsort_nobjects }
ext.kmersize = { params.xengsort_kmersize }
publishDir = [
mode: params.publish_dir_mode,
path: { "${params.outdir}/reference" },
saveAs: { params.save_reference || params.build_only_index ? it : null }
]
}

withName: 'SAMTOOLS_FAIDX' {
publishDir = [
mode: params.publish_dir_mode,
10 changes: 8 additions & 2 deletions conf/test.config
@@ -43,8 +43,10 @@ params {
sentieon_dnascope_model = "s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Annotation/Sentieon/SentieonDNAscopeModel1.1.model"

// default params
split_fastq = 0 // no FASTQ splitting
tools = 'strelka' // Variant calling with Strelka
split_fastq = 0 // no FASTQ splitting
tools = 'strelka' // Variant calling with Strelka
xengsort_nobjects = 400000 // xengsort index n objects
xengsort_kmersize = 25 // xengsort index k-mer size
}

process {
@@ -89,4 +91,8 @@
"--low-memory",
].join(' ').trim() }
}

withName: '.*:XENGSORT_CLASSIFY' {
cpus = 1
}
}
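
Outside the test profile, the same process selector can be used in a custom config to scale classification resources up rather than down; a hypothetical override (values illustrative, passed with `-c custom.config`):

```groovy
process {
    // Hypothetical resource bump for xengsort classification
    withName: '.*:XENGSORT_CLASSIFY' {
        cpus   = 8
        memory = '32.GB'
    }
}
```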
26 changes: 26 additions & 0 deletions docs/output.md
@@ -18,6 +18,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
- [Split FastQ files](#split-fastq-files)
- [UMI consensus](#umi-consensus)
- [BBSplit contamination removal](#bbsplit-contamination-removal)
- [Xengsort contamination removal](#xengsort-contamination-removal)
- [Map to Reference](#map-to-reference)
- [BWA](#bwa)
- [BWA-mem2](#bwa-mem2)
@@ -202,6 +203,29 @@ By default, the following parameters are used for BBSplit `ambiguous2=best maxin

</details>

#### Xengsort contamination removal

[Xengsort](https://gitlab.com/genomeinformatics/xengsort) classifies xenograft reads against graft and host references and is useful for PDX-like samples where contaminating host reads should be excluded before alignment.

To enable the tool, add `--tools xengsort`.

- Reuse a pre-built index with `--xengsort_index`.
- Build an index during the run by providing `--xengsort_host_fasta` together with `--xengsort_nobjects` and `--xengsort_kmersize`. `--xengsort_host_fasta` accepts either a single FASTA path or a glob pattern that matches one or more host FASTA files (for example `/path/to/host.fa` or `/path/to/host/*.fa`). Please refer to the [xengsort documentation](https://gitlab.com/genomeinformatics/xengsort) to determine appropriate values for `nobjects` and `kmersize`.

By default, classified FastQ files are intermediate and not published to `outdir`. Set `--save_xengsort_reads` to publish them.

<details markdown="1">
<summary>Output files</summary>

- `preprocessing/xengsort/`
- `*-graft.*fq.gz`: Reads assigned to graft reference (used for downstream alignment; e.g. Human).
- `*-host.*fq.gz`: Reads assigned to host reference (e.g. Mouse).
- `*-both.*fq.gz`: Reads matching both references.
- `*-neither.*fq.gz`: Reads matching neither reference.
- `*-ambiguous.*fq.gz`: Ambiguous assignments.

</details>

### Map to Reference

#### BWA
@@ -1449,4 +1473,6 @@ Contains reference folders generated by the pipeline. These files are only publi
- [MSIsensorPro](https://github.com/xjtu-omics/msisensor-pro) scan of the reference genome to get microsatellites information
- `pon/`
- Tabix index generated by [Tabix](http://www.htslib.org/doc/tabix.html) from the given panel-of-normals file
- `xengsort_index/`
- Xengsort index files generated from graft/host references (for example `*.hash` and `*.info`).
</details>
33 changes: 33 additions & 0 deletions docs/usage.md
@@ -63,6 +63,39 @@ genome: 'GATK.GRCh38'

You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch).

## Optional contamination removal with xengsort

`nf-core/sarek` supports host/graft read disambiguation with [xengsort](https://gitlab.com/genomeinformatics/xengsort).
Enable it by adding `xengsort` to `--tools`.

When building an index during the run, provide:

- `--xengsort_host_fasta`: host FASTA (path or glob)
- `--xengsort_nobjects`: value for `xengsort index --nobjects`
- `--xengsort_kmersize`: value for `xengsort index --kmersize`

Alternatively, reuse an existing index with `--xengsort_index`.
For guidance on choosing appropriate `--nobjects` and `--kmersize` values,
see https://gitlab.com/genomeinformatics/xengsort.

When Sarek builds a xengsort index, it is only published to `outdir/reference` if
`--save_reference` or `--build_only_index` is enabled.

Example:

```bash
nextflow run nf-core/sarek \
-profile docker \
--input samplesheet.csv \
--outdir results \
--tools xengsort,strelka \
--xengsort_host_fasta /path/to/host_reference.fasta \
--xengsort_nobjects 400000 \
--xengsort_kmersize 25
```

Set `--save_xengsort_reads` to publish xengsort-classified FASTQ files.
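
As with the `genome` example above, these options can also be kept in a params file passed via `-params-file`; a minimal sketch (all paths are placeholders):

```yaml
input: 'samplesheet.csv'
outdir: 'results'
tools: 'xengsort,strelka'
xengsort_host_fasta: '/path/to/host_reference.fasta'
xengsort_nobjects: 400000
xengsort_kmersize: 25
save_xengsort_reads: true
```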

## Input: Sample sheet configurations

You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use the parameter `--input` to specify its location. It has to be a comma-separated file with at least 3 columns, and a header row as shown in the examples below.
3 changes: 3 additions & 0 deletions main.nf
@@ -125,6 +125,8 @@ workflow NFCORE_SAREK {
params.step,
params.tools ?: 'no_tools',
params.vep_include_fasta,
params.xengsort_index,
params.xengsort_host_fasta,
)

// Build intervals if needed
@@ -285,6 +287,7 @@ workflow NFCORE_SAREK {
PREPARE_GENOME.out.ascat_loci_gc,
PREPARE_GENOME.out.ascat_loci_rt,
PREPARE_GENOME.out.bbsplit_index,
PREPARE_GENOME.out.xengsort_index,
PREPARE_GENOME.out.bcftools_annotations,
PREPARE_GENOME.out.bcftools_annotations_tbi,
params.bcftools_columns ? Channel.fromPath(params.bcftools_columns).collect() : Channel.value([]),
7 changes: 7 additions & 0 deletions modules/local/xengsort/classify/environment.yml
@@ -0,0 +1,7 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
channels:
- conda-forge
- bioconda
dependencies:
- "bioconda::xengsort=2.1.0"
87 changes: 87 additions & 0 deletions modules/local/xengsort/classify/main.nf
@@ -0,0 +1,87 @@
process XENGSORT_CLASSIFY {
tag "$meta.id"
label 'process_medium'

conda "${moduleDir}/environment.yml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'https://depot.galaxyproject.org/singularity/xengsort:2.1.0--pyhdfd78af_1':
'community.wave.seqera.io/library/htslib_pip_xengsort:39f2d076bf655602'}"

input:
tuple val(meta), path(reads)
path index_folder

output:
tuple val(meta), path('*-graft.*fq.gz') , optional:true, emit: graft_fastq
tuple val(meta), path('*-host.*fq.gz') , optional:true, emit: host_fastq
tuple val(meta), path('*-both.*fq.gz') , optional:true, emit: both_fastq
tuple val(meta), path('*-neither.*fq.gz') , optional:true, emit: neither_fastq
tuple val(meta), path('*-ambiguous.*fq.gz'), optional:true, emit: ambiguous_fastq
tuple val("${task.process}"), val('xengsort'), eval("xengsort --version"), topic: versions, emit: versions_xengsort_classify

when:
task.ext.when == null || task.ext.when

script:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"
def mode = task.ext.mode ?: 'count'

def fastq_args = meta.single_end ? "--fastq ${reads}" : "--fastq ${reads[0]} --pairs ${reads[1]}"

"""
# Check if the index folder contains the expected index files
HASH_COUNT=\$(ls "${index_folder}"/*.hash 2>/dev/null | wc -l || true)
INFO_COUNT=\$(ls "${index_folder}"/*.info 2>/dev/null | wc -l || true)

if [ "\$HASH_COUNT" -ne 1 ] || [ "\$INFO_COUNT" -ne 1 ]; then
echo "ERROR: The input index directory must contain exactly one .hash and one .info file." >&2
echo "Found \$HASH_COUNT .hash files and \$INFO_COUNT .info files in ${index_folder}." >&2
exit 1
fi

# Extract the index prefix (the .hash and .info have the same basename)
HASH_FILE=\$(ls "${index_folder}"/*.hash)
INDEX_PREFIX=\$(basename "\$HASH_FILE" .hash)

# Run xengsort classify
xengsort classify \\
--index "${index_folder}/\$INDEX_PREFIX" \\
$fastq_args \\
--mode $mode \\
--prefix $prefix \\
--threads ${task.cpus} \\
$args
"""

stub:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: "${meta.id}"

if (meta.single_end) {
"""
echo $args

echo '' | gzip > ${prefix}-graft.fq.gz
echo '' | gzip > ${prefix}-host.fq.gz
echo '' | gzip > ${prefix}-both.fq.gz
echo '' | gzip > ${prefix}-neither.fq.gz
echo '' | gzip > ${prefix}-ambiguous.fq.gz
"""
} else {
"""
echo $args

echo '' | gzip > ${prefix}-graft.1.fq.gz
echo '' | gzip > ${prefix}-graft.2.fq.gz
echo '' | gzip > ${prefix}-host.1.fq.gz
echo '' | gzip > ${prefix}-host.2.fq.gz
echo '' | gzip > ${prefix}-both.1.fq.gz
echo '' | gzip > ${prefix}-both.2.fq.gz
echo '' | gzip > ${prefix}-neither.1.fq.gz
echo '' | gzip > ${prefix}-neither.2.fq.gz
echo '' | gzip > ${prefix}-ambiguous.1.fq.gz
echo '' | gzip > ${prefix}-ambiguous.2.fq.gz
"""
}
}
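
The index-validation step in the script above can be exercised in isolation; a minimal shell sketch against a mock index directory (directory and file names are hypothetical):

```shell
# Standalone sketch of the .hash/.info check from XENGSORT_CLASSIFY,
# run against a mock index directory with placeholder names.
mkdir -p mock_index
touch mock_index/host_graft.hash mock_index/host_graft.info

# Count candidate index files; the module requires exactly one of each
HASH_COUNT=$(ls mock_index/*.hash 2>/dev/null | wc -l || true)
INFO_COUNT=$(ls mock_index/*.info 2>/dev/null | wc -l || true)

if [ "$HASH_COUNT" -ne 1 ] || [ "$INFO_COUNT" -ne 1 ]; then
    echo "ERROR: expected exactly one .hash and one .info file in mock_index" >&2
    exit 1
fi

# Derive the prefix passed to `xengsort classify --index`:
# the shared basename of the .hash/.info pair
HASH_FILE=$(ls mock_index/*.hash)
INDEX_PREFIX=$(basename "$HASH_FILE" .hash)
echo "$INDEX_PREFIX"   # prints host_graft
```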