
Nextflow pipeline to generate data for nf-core/test-datasets gwas branch #1610


Open
wants to merge 27 commits into base: gwas
Changes from all commits (27 commits)
1b2ad2a
Started converting a nextflow pipeline - downloading full vcf files f…
tylergross97 May 13, 2025
a374fbe
Working version
May 14, 2025
70d51fe
Removed old data/ and scripts/
May 14, 2025
8780fe5
indexed chunked vcfs
May 14, 2025
82cf061
Updated README
May 14, 2025
e9981f9
Updated README again
May 14, 2025
70b95c0
pushing to new remote branch so i can migrate from SSH to local dev
Jun 30, 2025
4f47e55
Updated .gitignore
Jun 30, 2025
f95a4de
Remove untracked files
Jun 30, 2025
41a2f1e
module tests all pass
tylergross97 Jul 1, 2025
484264f
Removed extra documentation - ADD_NEW_DATA.md and social_preview_2.png
tylergross97 Jul 2, 2025
bbd218b
Passed all tests and set up .github/workflows/nf-test.yml
tylergross97 Jul 2, 2025
e8f0af7
passed all tests
tylergross97 Jul 2, 2025
34332c3
Updated nf-test.yml to run on gwas branch push, not main or dev
tylergross97 Jul 2, 2025
fdb7e6e
Updated nextflow.config to include a test profile so docker can run
tylergross97 Jul 2, 2025
c6eadc5
Improved test name for main.nf.test
tylergross97 Jul 2, 2025
edc6889
Allowed for sharding and add reports for profile test
tylergross97 Jul 2, 2025
7fe0190
Removed edge veresion
tylergross97 Jul 2, 2025
d7c3f69
Add test in generate_example_genotypes_vcfs.nf.test that checks url a…
tylergross97 Jul 3, 2025
544d8a7
fixed linting.yml
tylergross97 Jul 3, 2025
86189a0
Added hpc profile alongside test
tylergross97 Jul 6, 2025
3ec8f6c
added mermaid DAG to README.md
tylergross97 Jul 7, 2025
cf81146
Updated mermaid plot
tylergross97 Jul 7, 2025
219f7de
Removed .nf-test/reports upload test results step in nf-test.yml
tylergross97 Jul 7, 2025
e39108b
added bebugging to .nf-test.log upload
tylergross97 Jul 7, 2025
29f9ff8
added --verbose flag to nf-test step in nf-test.yml
tylergross97 Jul 7, 2025
390f676
added tap reports
tylergross97 Jul 7, 2025
22 changes: 22 additions & 0 deletions .editorconfig
@@ -0,0 +1,22 @@
root = true

[*]
charset = utf-8
end_of_line = lf
indent_style = space
insert_final_newline = true
trim_trailing_whitespace = true
max_line_length = 120

[*.{md,yml,yaml,html,css,scss,js}]
indent_size = 2

[*.{py,java,r,R}]
indent_size = 4

[*.go]
indent_style = tab

[*.{sh,bash}]
indent_size = 2

35 changes: 35 additions & 0 deletions .github/workflows/linting.yml
@@ -0,0 +1,35 @@
name: nf-core linting
# This workflow is triggered on pushes and PRs to the repository.
# It runs Prettier and Black formatting checks to ensure
# that the code meets the nf-core style guidelines.
on:
push:
branches:
- gwas
pull_request:
branches:
- gwas
release:
types: [published]

jobs:
Prettier:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- uses: actions/setup-node@v4

- name: Install Prettier
run: npm install -g prettier

- name: Run Prettier --check
run: prettier --check .

PythonBlack:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Check code lints with Black
uses: psf/black@stable
61 changes: 61 additions & 0 deletions .github/workflows/nf-test.yml
@@ -0,0 +1,61 @@
name: nf-test

on:
push:
branches:
- gwas
pull_request:
branches:
- gwas

jobs:
test:
runs-on: ubuntu-latest
strategy:
matrix:
NXF_VER:
- "25.04.6"
TEST_FILE:
- tests/modules/generate_example_genotypes_vcfs.nf.test
- tests/modules/concat_chunked_vcfs.nf.test
- tests/modules/generate_pheno_cov.nf.test
- tests/modules/extract_sample_ids.nf.test
- tests/modules/chunk_vcfs.nf.test
- tests/modules/index_chunked_vcfs.nf.test
- tests/main.nf.test
steps:
- name: Check out pipeline code
uses: actions/checkout@v4

- name: Install Nextflow
uses: nf-core/setup-nextflow@v2
with:
version: "${{ matrix.NXF_VER }}"

- name: Install nf-test
uses: nf-core/setup-nf-test@v1

- name: Run nf-test
run: |
mkdir -p reports
nf-test test ${{ matrix.TEST_FILE }} --profile test --verbose --tap reports/test-results-${{ strategy.job-index }}.tap

- name: List files for debugging
run: ls -l .

- name: Upload TAP reports
if: always()
uses: actions/upload-artifact@v4
with:
name: tap-reports-${{ matrix.NXF_VER }}-${{ strategy.job-index }}
path: |
reports/*.tap
if-no-files-found: warn

- name: Upload test results
if: always()
uses: actions/upload-artifact@v4
with:
name: nf-test-results-${{ matrix.NXF_VER }}-${{ strategy.job-index }}
path: |
.nf-test.log
24 changes: 24 additions & 0 deletions .gitignore
@@ -0,0 +1,24 @@
# Nextflow logs and metadata
*.nextflow.log*
/.nextflow/
.nextflow/**

# SLURM or scheduler logs
*.out
*.err

# Shell scripts (optional, if not versioning run.sh)
*.sh

# Work and results/vcfs
/work/
/results/vcfs/
.nf-test/tests/**
.nf-test*

# Nextflow temporary execution files
*.command.*
*.Rout
*.tmp
8 changes: 8 additions & 0 deletions .nf-core.yml
@@ -0,0 +1,8 @@
repository_type: pipeline
nf_core_version: "3.3.1"

lint:
# Add any specific lint configurations here
# For example, to skip certain tests:
# actions_awsfulltest: false
# pipeline_todos: false
29 changes: 29 additions & 0 deletions .prettireignore
@@ -0,0 +1,29 @@
*.{png,jpg,jpeg,gif,svg,ico}
*.{woff,woff2,eot,ttf,otf}
*.{mp4,webm,ogg,mp3,wav,flac,aac}
*.{zip,tar,gz,rar,7z}
work/
.nextflow*
*.log
.git/
.gitignore
testing*
*.fa
*.fasta
*.fastq
*.fastq.gz
*.fq.gz
*.bam
*.sam
*.vcf
*.bcf
*.txt
*.tsv
*.csv
*.bed
*.gtf
*.gff
*.wig
*.bigwig
*.bedgraph
results/
9 changes: 9 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,9 @@
{
"editor.formatOnSave": true,
"editor.defaultFormatter": "esbenp.prettier-vscode",
"python.formatting.provider": "black",
"python.formatting.blackArgs": ["--line-length=120"],
"editor.codeActionsOnSave": {
"source.fixAll": "explicit"
}
}
54 changes: 37 additions & 17 deletions README.md
@@ -1,13 +1,27 @@
# ![nfcore/test-datasets](docs/images/test-datasets_logo.png)

Test data to be used for automated testing with the nf-core pipelines

## Introduction

This is the gwas example-data branch, part of the nf-core collection of high quality Nextflow pipelines.

## Workflow DAG

Below is a diagram of the workflow steps as a Directed Acyclic Graph (DAG):

```mermaid
graph TD
A[GENERATE_EXAMPLE_GENOTYPES_VCFS] --> B[CHUNK_VCFS]
B --> C[INDEX_CHUNKED_VCFS]
B --> D[CONCAT_CHUNKED_VCFS]
B --> F[EXTRACT_SAMPLE_IDS]
F --> G[GENERATE_PHENO_COV]
```

## Git clone the gwas pipeline test data

If you want a local copy of the test data, you can either git clone the whole test-data repository, including the test data for all nf-core pipelines, or, to save storage space, clone only the example data for one specific pipeline.

The data in this example-data branch is the same data that the gwas pipeline uses for testing. If you just want to access the data, clone the branch directly from nf-core; if you want to update the data and make a pull request, fork the repository first and then clone from your personal fork.
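For example, a minimal single-branch clone taken straight from nf-core (a sketch assuming SSH access is configured; swap in your fork's URL if you plan to open a pull request) could look like this:

```bash
# Clone only the gwas branch to keep the download small
git clone -b gwas --single-branch git@github.com:nf-core/test-datasets.git
```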

@@ -23,28 +37,34 @@ git clone -b gwas --single-branch git@github.com:USERNAME/test-datasets.git

## Documentation

nf-core/test-datasets comes with documentation in the `docs/` directory and scripts to generate the example data in the `scripts/` directory.
This test data comes from the 1000 Genomes Project phase3 release of variant calls. VCF files have been 'chunked' to include only the first 4,500 variants to reduce file sizes. Chromosome Y is excluded. Please see the datasets [README](https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/README_phase3_callset_20150220) for more details. Covariates and phenotypes were randomly generated for each sample in the VCF.

nf-core/test-datasets comes with documentation in the `docs/` directory, and the data can be regenerated by running `main.nf`.
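As a rough sketch, the data can be regenerated locally along these lines (assuming Docker is available and the `test` profile defined in `nextflow.config` is used):

```bash
# Regenerate the example data; outputs are published under results/
nextflow run main.nf -profile test
```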

## Example data organisation
nf-core/test-datasets generated test data is located in the `data/` directory.

nf-core/test-datasets generated test data is located in the `results/` directory and includes the following structure.

```
.
├── data_phenotypes_and_covariates
│   ├── example1.covar
│   └── example1.pheno
├── data_shrink_chunk_4500
│   ├── chr10.vcf.bgz
│   ├── chr10.vcf.bgz.tbi
│   ├── chr11.vcf.bgz
│   ├── chr11.vcf.bgz.tbi
└── data_shrink_combined_4500
├── chr1_to_22_and_X.vcf.bgz
└── chr1_to_22_and_X.vcf.bgz.tbi
results/
├── chunked_vcfs/
│   ├── chr1_chunked.vcf.gz
│   ├── chr1_chunked.vcf.gz.tbi
│   ├── chr2_chunked.vcf.gz
│   ├── chr2_chunked.vcf.gz.tbi
│   ├── ...
│   ├── chrX_chunked.vcf.gz
│   ├── chrX_chunked.vcf.gz.tbi
│   ├── combined_chunked.vcf.gz
│   └── combined_chunked.vcf.gz.tbi
├── pheno_cov/
│   ├── example.pheno
│   └── example.covar

```

Each chromosome-specific VCF file (chr\*.vcf.gz) is accompanied by its corresponding tabix index (.vcf.gz.tbi), enabling efficient querying. A combined VCF and index are also included for downstream association tests or visualization.
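For instance, a quick region query against one of the indexed chunks might look like this (path and coordinates are illustrative, and the contig name must match what the VCF uses internally, e.g. `1` rather than `chr1` for 1000 Genomes phase 3 calls):

```bash
# Extract variants from a small window of chromosome 1 in the chunked, indexed VCF
bcftools view -r 1:10000-200000 results/chunked_vcfs/chr1_chunked.vcf.gz | head
```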

## Support

For further information or help, don't hesitate to get in touch on our [Slack organisation](https://nf-co.re/join/slack) (a tool for instant messaging).

Binary file removed data/data_shrink_chunk_4500/chr1.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr1.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr10.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr10.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr11.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr11.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr12.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr12.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr13.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr13.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr14.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr14.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr15.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr15.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr16.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr16.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr17.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr17.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr18.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr18.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr19.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr19.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr2.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr2.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr20.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr20.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr21.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr21.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr22.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr22.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr3.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr3.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr4.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr4.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr5.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr5.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr6.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr6.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr7.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr7.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr8.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr8.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr9.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chr9.vcf.bgz.tbi
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chrX.vcf.bgz
Binary file not shown.
Binary file removed data/data_shrink_chunk_4500/chrX.vcf.bgz.tbi
Binary file not shown.
16 changes: 8 additions & 8 deletions docs/ADD_NEW_DATA.md
@@ -2,11 +2,11 @@

Please fill in the appropriate checklist below (delete whatever is not relevant). These are the most common things requested when adding a new test dataset.

- [ ] Check [here](https://github.com/nf-core/test-datasets/branches/all) that there isn't already a branch containing data that could be used
- If this is the case, follow the [documentation on how to use an existing test dataset](https://github.com/nf-core/test-datasets/blob/master/docs/USE_EXISTING_DATA.md)
- [ ] Fork the [nf-core/test-datasets repository](https://github.com/nf-core/test-datasets) to your GitHub account
- [ ] Create a new branch on your fork
- [ ] Add your test dataset
- [ ] Make a PR on a new branch with a relevant name
- [ ] Wait for the PR to be merged
- [ ] Use this newly created branch for your tests
6 changes: 3 additions & 3 deletions docs/USE_EXISTING_DATA.md
@@ -2,6 +2,6 @@

Please fill in the appropriate checklist below (delete whatever is not relevant). These are the most common things requested when adding a new test dataset.

- [ ] Check [here](https://github.com/nf-core/test-datasets/branches/all) to find the branch corresponding to the test dataset you want to use
- [ ] Specify in the test.config the path to the files from the test dataset
- [ ] Set up your CI tests following the nf-core best practices (cf [.travis.yml template](https://github.com/nf-core/tools/blob/dev/nf_core/pipeline-template/{{cookiecutter.name_noslash}}/.travis.yml))
Binary file added docs/images/test-datasets_logo 2.png
29 changes: 29 additions & 0 deletions main.nf
@@ -0,0 +1,29 @@
#!/usr/bin/env nextflow
nextflow.enable.dsl = 2

include { GENERATE_EXAMPLE_GENOTYPES_VCFS } from './modules/generate_example_genotypes_vcfs.nf'
include { CHUNK_VCFS } from './modules/chunk_vcfs.nf'
include { CONCAT_CHUNKED_VCFS } from './modules/concat_chunked_vcfs.nf'
include { EXTRACT_SAMPLE_IDS } from './modules/extract_sample_ids.nf'
include { GENERATE_PHENO_COV } from './modules/generate_pheno_cov.nf'
include { INDEX_CHUNKED_VCFS } from './modules/index_chunked_vcfs.nf'
workflow {
// Run the download process
GENERATE_EXAMPLE_GENOTYPES_VCFS()

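// Pair each generated VCF with its chromosome label, taken from the second dot-separated field of the file name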
def vcfs_with_chr = GENERATE_EXAMPLE_GENOTYPES_VCFS.out.vcfs
.flatten()
.map { file ->
def chr = file.name.toString().split("\\.")[1] // safer than `tokenize`
tuple(chr, file)
}

// Feed the tuples into the chunking process
CHUNK_VCFS(vcfs_with_chr)
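// Use the chr1 chunk to extract sample IDs, then generate random phenotypes and covariates for those samples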
chr1_chunked = CHUNK_VCFS.out.chunked_vcfs.filter { chr, file -> chr == 'chr1'}.map { chr, file -> file }
EXTRACT_SAMPLE_IDS(chr1_chunked)
GENERATE_PHENO_COV(EXTRACT_SAMPLE_IDS.out.sample_ids)
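// Index every per-chromosome chunk, then concatenate all chunks into one combined VCF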
INDEX_CHUNKED_VCFS(CHUNK_VCFS.out.chunked_vcfs)
all_chunked_vcfs = CHUNK_VCFS.out.chunked_vcfs.map { chr, file -> file }.collect()
CONCAT_CHUNKED_VCFS(all_chunked_vcfs)
}
18 changes: 18 additions & 0 deletions modules/chunk_vcfs.nf
@@ -0,0 +1,18 @@
process CHUNK_VCFS {
container "community.wave.seqera.io/library/bcftools_tabix_pip_tools:48085064a9189d8c"
publishDir params.outdir_chunked_vcfs, mode: 'copy'

input:
tuple val(chr), path(vcfs)

output:
tuple val(chr), path("${chr}_chunked.vcf.gz"), emit: chunked_vcfs

script:
"""
# Keep the VCF header plus the first 4,500 variant records, then re-compress with bgzip
bcftools view -H ${vcfs} | head -n 4500 > variants.txt
bcftools view -h ${vcfs} > header.txt
cat header.txt variants.txt | bgzip > ${chr}_chunked.vcf.gz
"""
}
17 changes: 17 additions & 0 deletions modules/concat_chunked_vcfs.nf
@@ -0,0 +1,17 @@
process CONCAT_CHUNKED_VCFS {
container "community.wave.seqera.io/library/bcftools_tabix_pip_tools:48085064a9189d8c"
publishDir params.outdir_chunked_vcfs, mode: 'copy'

input:
path vcf_files

output:
path "combined_chunked.vcf.gz", emit: combined_vcf
path "combined_chunked.vcf.gz.tbi", emit: combined_vcf_tbi

script:
"""
bcftools concat -Oz -o combined_chunked.vcf.gz ${vcf_files.join(' ')}
tabix -p vcf combined_chunked.vcf.gz
"""
}
15 changes: 15 additions & 0 deletions modules/extract_sample_ids.nf
@@ -0,0 +1,15 @@
process EXTRACT_SAMPLE_IDS {
container "community.wave.seqera.io/library/r-base:4.4.3--1e564c44feffeaa0"
publishDir params.outdir_pheno_cov, mode: 'symlink'

input:
path vcf_file

output:
path "sample_ids.txt", emit: sample_ids

script:
"""
zcat $vcf_file | grep '#CHROM' | cut -f10- | tr '\t' '\n' > sample_ids.txt
"""
}