
Nextflow pipeline to generate data for nf-core/test-datasets gwas branch #1610


Open
wants to merge 27 commits into base: gwas
Changes from all commits (27 commits)
1b2ad2a
Started converting a nextflow pipeline - downloading full vcf files f…
tylergross97 May 13, 2025
a374fbe
Working version
May 14, 2025
70d51fe
Removed old data/ and scripts/
May 14, 2025
8780fe5
indexed chunked vcfs
May 14, 2025
82cf061
Updated README
May 14, 2025
e9981f9
Updated README again
May 14, 2025
70b95c0
pushing to new remote branch so i can migrate from SSH to local dev
Jun 30, 2025
4f47e55
Updated .gitignore
Jun 30, 2025
f95a4de
Remove untracked files
Jun 30, 2025
41a2f1e
module tests all pass
tylergross97 Jul 1, 2025
484264f
Removed extra documentation - ADD_NEW_DATA.md and social_preview_2.png
tylergross97 Jul 2, 2025
bbd218b
Passed all tests and set up .github/workflows/nf-test.yml
tylergross97 Jul 2, 2025
e8f0af7
passed all tests
tylergross97 Jul 2, 2025
34332c3
Updated nf-test.yml to run on gwas branch push, not main or dev
tylergross97 Jul 2, 2025
fdb7e6e
Updated nextflow.config to include a test profile so docker can run
tylergross97 Jul 2, 2025
c6eadc5
Improved test name for main.nf.test
tylergross97 Jul 2, 2025
edc6889
Allowed for sharding and add reports for profile test
tylergross97 Jul 2, 2025
7fe0190
Removed edge veresion
tylergross97 Jul 2, 2025
d7c3f69
Add test in generate_example_genotypes_vcfs.nf.test that checks url a…
tylergross97 Jul 3, 2025
544d8a7
fixed linting.yml
tylergross97 Jul 3, 2025
86189a0
Added hpc profile alongside test
tylergross97 Jul 6, 2025
3ec8f6c
added mermaid DAG to README.md
tylergross97 Jul 7, 2025
cf81146
Updated mermaid plot
tylergross97 Jul 7, 2025
219f7de
Removed .nf-test/reports upload test results step in nf-test.yml
tylergross97 Jul 7, 2025
e39108b
added bebugging to .nf-test.log upload
tylergross97 Jul 7, 2025
29f9ff8
added --verbose flag to nf-test step in nf-test.yml
tylergross97 Jul 7, 2025
390f676
added tap reports
tylergross97 Jul 7, 2025
22 changes: 22 additions & 0 deletions .editorconfig
@@ -0,0 +1,22 @@
root = true

[*]
charset = utf-8
end_of_line = lf
indent_style = space
insert_final_newline = true
trim_trailing_whitespace = true
max_line_length = 120

[*.{md,yml,yaml,html,css,scss,js}]
indent_size = 2

[*.{py,java,r,R}]
indent_size = 4

[*.go]
indent_style = tab

[*.{sh,bash}]
indent_size = 2

35 changes: 35 additions & 0 deletions .github/workflows/linting.yml
@@ -0,0 +1,35 @@
name: nf-core linting
# This workflow is triggered on pushes and PRs to the repository.
# It runs Prettier and Black formatting checks to ensure
# that the code meets the nf-core style guidelines.
on:
push:
branches:
- gwas
pull_request:
branches:
- gwas
release:
types: [published]

jobs:
Prettier:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- uses: actions/setup-node@v4

- name: Install Prettier
run: npm install -g prettier

- name: Run Prettier --check
run: prettier --check .

PythonBlack:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Check code lints with Black
uses: psf/black@stable
61 changes: 61 additions & 0 deletions .github/workflows/nf-test.yml
@@ -0,0 +1,61 @@
name: nf-test

on:
push:
branches:
- gwas
pull_request:
branches:
- gwas

jobs:
test:
runs-on: ubuntu-latest
strategy:
matrix:
NXF_VER:
- "25.04.6"
TEST_FILE:
- tests/modules/generate_example_genotypes_vcfs.nf.test
- tests/modules/concat_chunked_vcfs.nf.test
- tests/modules/generate_pheno_cov.nf.test
- tests/modules/extract_sample_ids.nf.test
- tests/modules/chunk_vcfs.nf.test
- tests/modules/index_chunked_vcfs.nf.test
- tests/main.nf.test
steps:
- name: Check out pipeline code
uses: actions/checkout@v4

- name: Install Nextflow
uses: nf-core/setup-nextflow@v2
with:
version: "${{ matrix.NXF_VER }}"

- name: Install nf-test
uses: nf-core/setup-nf-test@v1

- name: Run nf-test
run: |
mkdir -p reports
nf-test test ${{ matrix.TEST_FILE }} --profile test --verbose --tap reports/test-results-${{ strategy.job-index }}.tap

- name: List files for debugging
run: ls -l .

- name: Upload TAP reports
if: always()
uses: actions/upload-artifact@v4
with:
name: tap-reports-${{ matrix.NXF_VER }}-${{ strategy.job-index }}
path: |
reports/*.tap
if-no-files-found: warn

- name: Upload test results
if: always()
uses: actions/upload-artifact@v4
with:
name: nf-test-results-${{ matrix.NXF_VER }}-${{ strategy.job-index }}
path: |
.nf-test.log
24 changes: 24 additions & 0 deletions .gitignore
@@ -0,0 +1,24 @@
# Nextflow logs and metadata
*.nextflow.log*
/.nextflow/
.nextflow/**

# SLURM or scheduler logs
*.out
*.err

# Shell scripts (optional, if not versioning run.sh)
*.sh

# Work and results/vcfs
/work/
/results/vcfs/
.nf-test/tests/**
.nf-test*

# Nextflow temporary execution files
*.command.*
*.Rout
*.tmp
8 changes: 8 additions & 0 deletions .nf-core.yml
@@ -0,0 +1,8 @@
repository_type: pipeline
nf_core_version: "3.3.1"

lint:
# Add any specific lint configurations here
# For example, to skip certain tests:
# actions_awsfulltest: false
# pipeline_todos: false
29 changes: 29 additions & 0 deletions .prettireignore
@@ -0,0 +1,29 @@
*.{png,jpg,jpeg,gif,svg,ico}
*.{woff,woff2,eot,ttf,otf}
*.{mp4,webm,ogg,mp3,wav,flac,aac}
*.{zip,tar,gz,rar,7z}
work/
.nextflow*
*.log
.git/
.gitignore
testing*
*.fa
*.fasta
*.fastq
*.fastq.gz
*.fq.gz
*.bam
*.sam
*.vcf
*.bcf
*.txt
*.tsv
*.csv
*.bed
*.gtf
*.gff
*.wig
*.bigwig
*.bedgraph
results/
9 changes: 9 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,9 @@
{
"editor.formatOnSave": true,
"editor.defaultFormatter": "esbenp.prettier-vscode",
"python.formatting.provider": "black",
"python.formatting.blackArgs": ["--line-length=120"],
"editor.codeActionsOnSave": {
"source.fixAll": "explicit"
}
}
54 changes: 37 additions & 17 deletions README.md
@@ -1,13 +1,27 @@
# ![nfcore/test-datasets](docs/images/test-datasets_logo.png)

Test data to be used for automated testing with the nf-core pipelines

## Introduction

This is the gwas example-data branch, part of the nf-core collection of high quality Nextflow pipelines.

## Workflow DAG

Below is a diagram of the workflow steps as a Directed Acyclic Graph (DAG):

```mermaid
graph TD
A[GENERATE_EXAMPLE_GENOTYPES_VCFS] --> B[CHUNK_VCFS]
B --> C[INDEX_CHUNKED_VCFS]
B --> D[CONCAT_CHUNKED_VCFS]
B --> F[EXTRACT_SAMPLE_IDS]
F --> G[GENERATE_PHENO_COV]
```

## Git clone the gwas pipeline test data

If you want a local copy of the test data, you can either git clone the whole test-data repository, including the test data for all nf-core pipelines, or, to save storage space, clone only the example data for one specific pipeline.

The data in this example-data branch is the same data that the gwas pipeline uses for testing. If you just want to access the data, clone the branch directly from nf-core; if you want to update the data and make a pull request, fork the repository first and then clone from your personal fork.
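For example, a minimal single-branch clone taken straight from nf-core (a sketch assuming SSH access is configured; swap in your fork's URL if you plan to open a pull request) could look like this:

```bash
# Clone only the gwas branch to keep the download small
git clone -b gwas --single-branch git@github.com:nf-core/test-datasets.git
```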

@@ -23,28 +37,34 @@ git clone -b gwas --single-branch git@github.com:USERNAME/test-datasets.git

## Documentation

nf-core/test-datasets comes with documentation in the `docs/` directory and scripts to generate the example data in the `scripts/` directory.
This test data comes from the 1000 Genomes Project phase3 release of variant calls. VCF files have been 'chunked' to include only the first 4,500 variants to reduce file sizes. Chromosome Y is excluded. Please see the datasets [README](https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/README_phase3_callset_20150220) for more details. Covariates and phenotypes were randomly generated for each sample in the VCF.

nf-core/test-datasets comes with documentation in the `docs/` directory, and the data can be regenerated by running `main.nf`.
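As a rough sketch, the data can be regenerated locally along these lines (assuming Docker is available and the `test` profile defined in `nextflow.config` is used):

```bash
# Regenerate the example data; outputs are published under results/
nextflow run main.nf -profile test
```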

## Example data organisation
nf-core/test-datasets generated test data is located in the `data/` directory.

nf-core/test-datasets generated test data is located in the `results/` directory and includes the following structure.

```
.
├── data_phenotypes_and_covariates
│   ├── example1.covar
│   └── example1.pheno
├── data_shrink_chunk_4500
│   ├── chr10.vcf.bgz
│   ├── chr10.vcf.bgz.tbi
│   ├── chr11.vcf.bgz
│   ├── chr11.vcf.bgz.tbi
└── data_shrink_combined_4500
├── chr1_to_22_and_X.vcf.bgz
└── chr1_to_22_and_X.vcf.bgz.tbi
results/
├── chunked_vcfs/
│   ├── chr1_chunked.vcf.gz
│   ├── chr1_chunked.vcf.gz.tbi
│   ├── chr2_chunked.vcf.gz
│   ├── chr2_chunked.vcf.gz.tbi
│   ├── ...
│   ├── chrX_chunked.vcf.gz
│   ├── chrX_chunked.vcf.gz.tbi
│   ├── combined_chunked.vcf.gz
│   └── combined_chunked.vcf.gz.tbi
├── pheno_cov/
│   ├── example.pheno
│   └── example.covar

```

Each chromosome-specific VCF file (chr\*.vcf.gz) is accompanied by its corresponding tabix index (.vcf.gz.tbi), enabling efficient querying. A combined VCF and index are also included for downstream association tests or visualization.
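For instance, a quick region query against one of the indexed chunks might look like this (path and coordinates are illustrative, and the contig name must match what the VCF uses internally, e.g. `1` rather than `chr1` for 1000 Genomes phase 3 calls):

```bash
# Extract variants from a small window of chromosome 1 in the chunked, indexed VCF
bcftools view -r 1:10000-200000 results/chunked_vcfs/chr1_chunked.vcf.gz | head
```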

## Support

For further information or help, don't hesitate to get in touch on our [Slack organisation](https://nf-co.re/join/slack) (a tool for instant messaging).

Binary file removed data/data_shrink_chunk_4500/chr1.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr1.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr10.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr10.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr11.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr11.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr12.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr12.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr13.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr13.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr14.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr14.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr15.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr15.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr16.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr16.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr17.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr17.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr18.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr18.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr19.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr19.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr2.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr2.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr20.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr20.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr21.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr21.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr22.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr22.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr3.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr3.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr4.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr4.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr5.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr5.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr6.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr6.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr7.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr7.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr8.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr8.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr9.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr9.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chrX.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chrX.vcf.bgz.tbi
Binary file not shown.
16 changes: 8 additions & 8 deletions docs/ADD_NEW_DATA.md
@@ -2,11 +2,11 @@

Please fill in the appropriate checklist below (delete whatever is not relevant). These are the most common things requested when adding a new test dataset.

- [ ] Check [here](https://github.com/nf-core/test-datasets/branches/all) that there isn't already a branch containing data that could be used
- If this is the case, follow the [documentation on how to use an existing test dataset](https://github.com/nf-core/test-datasets/blob/master/docs/USE_EXISTING_DATA.md)
- [ ] Fork the [nf-core/test-datasets repository](https://github.com/nf-core/test-datasets) to your GitHub account
- [ ] Create a new branch on your fork
- [ ] Add your test dataset
- [ ] Make a PR on a new branch with a relevant name
- [ ] Wait for the PR to be merged
- [ ] Use this newly created branch for your tests
6 changes: 3 additions & 3 deletions docs/USE_EXISTING_DATA.md
@@ -2,6 +2,6 @@

Please fill in the appropriate checklist below (delete whatever is not relevant). These are the most common things requested when adding a new test dataset.

- [ ] Check [here](https://github.com/nf-core/test-datasets/branches/all) to find the branch corresponding to the test dataset you want to use
- [ ] Specify in the test.config the path to the files from the test dataset
- [ ] Set up your CI tests following the nf-core best practices (cf [.travis.yml template](https://github.com/nf-core/tools/blob/dev/nf_core/pipeline-template/{{cookiecutter.name_noslash}}/.travis.yml))
Binary file added docs/images/test-datasets_logo 2.png
29 changes: 29 additions & 0 deletions main.nf
@@ -0,0 +1,29 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2

include { GENERATE_EXAMPLE_GENOTYPES_VCFS } from './modules/generate_example_genotypes_vcfs.nf'
include { CHUNK_VCFS } from './modules/chunk_vcfs.nf'
include { CONCAT_CHUNKED_VCFS } from './modules/concat_chunked_vcfs.nf'
include { EXTRACT_SAMPLE_IDS } from './modules/extract_sample_ids.nf'
include { GENERATE_PHENO_COV } from './modules/generate_pheno_cov.nf'
include { INDEX_CHUNKED_VCFS } from './modules/index_chunked_vcfs.nf'
workflow {
// Run the download process
GENERATE_EXAMPLE_GENOTYPES_VCFS()

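// Pair each generated VCF with its chromosome label, taken from the second dot-separated field of the file name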
def vcfs_with_chr = GENERATE_EXAMPLE_GENOTYPES_VCFS.out.vcfs
.flatten()
.map { file ->
def chr = file.name.toString().split("\\.")[1] // safer than `tokenize`
tuple(chr, file)
}

// Feed the tuples into the chunking process
CHUNK_VCFS(vcfs_with_chr)
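// Use the chr1 chunk to extract sample IDs, then generate random phenotypes and covariates for those samples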
chr1_chunked = CHUNK_VCFS.out.chunked_vcfs.filter { chr, file -> chr == 'chr1'}.map { chr, file -> file }
EXTRACT_SAMPLE_IDS(chr1_chunked)
GENERATE_PHENO_COV(EXTRACT_SAMPLE_IDS.out.sample_ids)
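// Index every per-chromosome chunk, then concatenate all chunks into one combined VCF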
INDEX_CHUNKED_VCFS(CHUNK_VCFS.out.chunked_vcfs)
all_chunked_vcfs = CHUNK_VCFS.out.chunked_vcfs.map { chr, file -> file }.collect()
CONCAT_CHUNKED_VCFS(all_chunked_vcfs)
}
18 changes: 18 additions & 0 deletions modules/chunk_vcfs.nf
@@ -0,0 +1,18 @@
process CHUNK_VCFS {
container "community.wave.seqera.io/library/bcftools_tabix_pip_tools:48085064a9189d8c"
publishDir params.outdir_chunked_vcfs, mode: 'copy'

input:
tuple val(chr), path(vcfs)

output:
tuple val(chr), path("${chr}_chunked.vcf.gz"), emit: chunked_vcfs

script:
"""
# Keep the VCF header plus the first 4,500 variant records, then re-compress with bgzip
bcftools view -H ${vcfs} | head -n 4500 > variants.txt
bcftools view -h ${vcfs} > header.txt
cat header.txt variants.txt | bgzip > ${chr}_chunked.vcf.gz
"""
}
17 changes: 17 additions & 0 deletions modules/concat_chunked_vcfs.nf
@@ -0,0 +1,17 @@
process CONCAT_CHUNKED_VCFS {
container "community.wave.seqera.io/library/bcftools_tabix_pip_tools:48085064a9189d8c"
publishDir params.outdir_chunked_vcfs, mode: 'copy'

input:
path vcf_files

output:
path "combined_chunked.vcf.gz", emit: combined_vcf
path "combined_chunked.vcf.gz.tbi", emit: combined_vcf_tbi

script:
"""
bcftools concat -Oz -o combined_chunked.vcf.gz ${vcf_files.join(' ')}
tabix -p vcf combined_chunked.vcf.gz
"""
}
15 changes: 15 additions & 0 deletions modules/extract_sample_ids.nf
@@ -0,0 +1,15 @@
process EXTRACT_SAMPLE_IDS {
container "community.wave.seqera.io/library/r-base:4.4.3--1e564c44feffeaa0"
publishDir params.outdir_pheno_cov, mode: 'symlink'

input:
path vcf_file

output:
path "sample_ids.txt", emit: sample_ids

script:
"""
zcat $vcf_file | grep '#CHROM' | cut -f10- | tr '\t' '\n' > sample_ids.txt
"""
}