Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions CITATION.cff
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
cff-version: 1.2.0
title: atomworks
message: >-
If you use this software, please cite it using the
metadata from this file.
type: software
authors:
- family-names: Corley
given-names: Nathaniel
- family-names: Mathis
given-names: Simon
- family-names: Krishna
given-names: Rohith
- family-names: Bauer
given-names: Magnus S.
- family-names: Thompson
given-names: Tuscan R.
- family-names: Ahern
given-names: Woody
- family-names: Kazman
given-names: Maxwell W.
- family-names: Brent
given-names: Rafael I.
- family-names: Didi
given-names: Kieran
- family-names: Kubaney
given-names: Andrew
- family-names: McHugh
given-names: Liam
- family-names: Nagle
given-names: Andrew
- family-names: Favor
given-names: Adam
- family-names: Kshirsagar
given-names: Meghana
- family-names: Sturmfels
given-names: Pascal
- family-names: Li
given-names: Yinuo
- family-names: Butcher
given-names: John
- family-names: Qiang
given-names: Bo
- family-names: Schaaf
given-names: Luna L.
- family-names: Mitra
given-names: Ria
- family-names: Campbell
given-names: Kerrie
- family-names: Zhang
given-names: Opa
- family-names: Weissman
given-names: Rose
- family-names: Humphreys
given-names: Ian R.
- family-names: Cong
given-names: Qian
- family-names: Jiang
given-names: Hanlun
- family-names: Funk
given-names: Jason
- family-names: Sonthalia
given-names: Satyaki
- family-names: Lio
given-names: Pietro
- family-names: Baker
given-names: David
- family-names: DiMaio
given-names: Frank
identifiers:
- type: doi
value: 10.1101/2025.08.14.670328
description: bioRxiv preprint
repository-code: https://github.com/RosettaCommons/atomworks
url: https://rosettacommons.github.io/atomworks/latest/
abstract: >-
A research-oriented data toolkit for training biomolecular
deep-learning foundation models. AtomWorks provides tools
for parsing, cleaning, manipulating, and converting
biological data (structures, sequences, small molecules)
as well as advanced dataset featurization and sampling for
deep-learning workflows.
keywords:
- bioinformatics
- machine-learning
- deep-learning
- protein-structure
- biotite
- structural-biology
license: BSD-3-Clause
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ We thank Hope Woods and Rachel Clune from the Rosetta Commons for their partners

If you make use of AtomWorks in your research, please cite:

> N. Corley\*, S. Mathis\*, R. Krishna\*, M. S. Bauer, T. R. Thompson, W. Ahern, M. W. Kazman, R. I. Brent, K. Didi, A. Kubaney, L. McHugh, A. Nagle, A. Favor, M. Kshirsagar, P. Sturmfels, Y. Li, J. Butcher, B. Qiang, L. L. Schaaf, R. Mitra, K. Campbell, O. Zhang, R. Weissman, I. R. Humphreys, Q. Cong, J. Funk, S. Sonthalia, P. Lio, D. Baker, F. DiMaio,
> N. Corley\*, S. Mathis\*, R. Krishna\*, M. S. Bauer, T. R. Thompson, W. Ahern, M. W. Kazman, R. I. Brent, K. Didi, A. Kubaney, L. McHugh, A. Nagle, A. Favor, M. Kshirsagar, P. Sturmfels, Y. Li, J. Butcher, B. Qiang, L. L. Schaaf, R. Mitra, K. Campbell, O. Zhang, R. Weissman, I. R. Humphreys, Q. Cong, H. Jiang, J. Funk, S. Sonthalia, P. Lio, D. Baker, F. DiMaio,
> "Accelerating Biomolecular Modeling with AtomWorks and RF3," bioRxiv, August 2025. doi: [10.1101/2025.08.14.670328](https://doi.org/10.1101/2025.08.14.670328)

If you use BibTeX, here's the Google Scholar-formatted citation:
Expand Down
9 changes: 8 additions & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

sys.path.insert(0, os.path.abspath("../src"))


import atomworks

project = "atomworks"
Expand Down Expand Up @@ -44,11 +45,17 @@
"sphinx.ext.viewcode", # Add source code links
"sphinx.ext.napoleon", # Google/NumPy style docstrings
"sphinx_gallery.gen_gallery", # Generates auto_examples/ from examples/
#"sphinx_click",
"sphinxcontrib.typer"
]

templates_path = ["_templates"]
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "examples/GALLERY_HEADER.rst", "ml/preprocessing.rst"]
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "examples/GALLERY_HEADER.rst"]#, "ml/preprocessing.rst"]

#autodoc_mock_imports = [
# "zstandard",
# "torch",
#]
# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

Expand Down
3 changes: 2 additions & 1 deletion docs/docs_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ sphinx-autodoc-typehints>=1.20.0,<2
nbsphinx>=0.8.9,<1
sphinx-gallery>=0.8.1,<1
ghp-import>=2.0.0,<3
pandoc>=2.0.0,<3
pandoc>=2.0.0,<3
sphinxcontrib-typer>=0.7.2
2 changes: 1 addition & 1 deletion docs/examples/load_and_visualize_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@
print("Available annotations:")
annotations = atom_array.get_annotation_categories()
for i, annotation in enumerate(annotations):
print(f" {i+1:2d}. {annotation}")
print(f" {i + 1:2d}. {annotation}")


# %%
Expand Down
3 changes: 2 additions & 1 deletion docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,5 @@ Welcome to **atomworks** — a toolkit for converting, parsing, and manipulating
api_reference
auto_examples/index
contributor_guide
mirrors
mirrors
msa
5 changes: 3 additions & 2 deletions docs/ml.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Core Modules
ml/samplers

Data Processing Modules
----------------------
-----------------------

.. toctree::
:maxdepth: 2
Expand All @@ -23,4 +23,5 @@ Data Processing Modules
ml/transforms/dna
ml/transforms/feature_aggregation
ml/transforms/msa
ml/utils
ml/utils
ml/preprocessing
39 changes: 39 additions & 0 deletions docs/ml/preprocessing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,45 @@ Utilities
---------

.. automodule:: atomworks.ml.preprocessing.utils
:members:
:undoc-members:
:show-inheritance:


MSA
---

The following functions can also be invoked from the command line; see :doc:`../msa`
for details.

Finding
^^^^^^^

.. automodule:: atomworks.ml.preprocessing.msa.finding
:members:
:undoc-members:
:show-inheritance:

Filtering
^^^^^^^^^

.. automodule:: atomworks.ml.preprocessing.msa.filtering
:members:
:undoc-members:
:show-inheritance:

Generating
^^^^^^^^^^

.. automodule:: atomworks.ml.preprocessing.msa.generating
:members:
:undoc-members:
:show-inheritance:

Organizing
^^^^^^^^^^

.. automodule:: atomworks.ml.preprocessing.msa.organizing
:members:
:undoc-members:
:show-inheritance:
31 changes: 31 additions & 0 deletions docs/msa.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
Multiple Sequence Alignment in AtomWorks
========================================

AtomWorks provides several command-line tools for Multiple Sequence Alignment (MSA) operations.

--------------

Find
----

.. typer:: atomworks_cli.find:app
:prog: atomworks msa find
:show-nested:

Filter
------
.. typer:: atomworks_cli.filter:app
:prog: atomworks msa filter
:show-nested:

Generate
--------
.. typer:: atomworks_cli.generate:app
:prog: atomworks msa generate
:show-nested:

Organize
--------
.. typer:: atomworks_cli.organize:app
:prog: atomworks msa organize
:show-nested:
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "atomworks"
version = "2.1.1"
version = "2.2.0"
description = "A research-oriented data toolkit for training biomolecular deep-learning foundation models"
readme = "README.md"
requires-python = ">=3.11"
Expand Down
2 changes: 1 addition & 1 deletion src/atomworks/biotite_patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ def array(atoms: list[Atom]) -> AtomArray:
for i, atom in enumerate(atoms):
if sorted(atom._annot.keys()) != names:
raise ValueError(
f"The atom at index {i} does not share the same " f"annotation categories as the atom at index 0"
f"The atom at index {i} does not share the same annotation categories as the atom at index 0"
)
array = AtomArray(len(atoms))

Expand Down
13 changes: 8 additions & 5 deletions src/atomworks/io/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def _build_cache_file_path(
def parse(
filename: os.PathLike | io.StringIO | io.BytesIO,
*,
file_type: Literal["cif", "pdb"] | None = None,
file_type: Literal["cif", "pdb", "mmjson"] | None = None,
ccd_mirror_path: os.PathLike | None = CCD_MIRROR_PATH,
cache_dir: os.PathLike | None = None,
save_to_cache: bool = False,
Expand Down Expand Up @@ -163,7 +163,7 @@ def parse(
atomic-level structure (e.g. .cif, .bcif, .cif.gz, .pdb), although .cif files are strongly recommended.

**Wrapper arguments:**
file_type (Literal["cif", "pdb"] | None, optional): The file type of the structure file.
file_type (Literal["cif", "pdb", "mmjson"] | None, optional): The file type of the structure file.
If not provided, the file type will be inferred automatically.
load_from_cache (bool, optional): Whether to load pre-compiled results from cache. Defaults to False.
cache_dir (PathLike, optional): Directory path to save pre-compiled results. Defaults to None.
Expand Down Expand Up @@ -329,9 +329,10 @@ def parse(
build_assembly=build_assembly,
extra_fields=extra_fields,
)
elif file_type in ("cif", "bcif"):
elif file_type in ("cif", "bcif", "mmjson"):
result = _parse_from_cif(
filename=filename,
file_type=file_type,
ccd_mirror_path=ccd_mirror_path,
add_missing_atoms=add_missing_atoms,
add_id_and_entity_annotations=add_id_and_entity_annotations,
Expand Down Expand Up @@ -684,7 +685,9 @@ def parse_atom_array(
return data_dict


def _parse_from_cif(filename: os.PathLike | io.StringIO | io.BytesIO, **kwargs) -> dict[str, Any]:
def _parse_from_cif(
filename: os.PathLike | io.StringIO | io.BytesIO, file_type: str | None = None, **kwargs
) -> dict[str, Any]:
"""Parse the CIF file.

Return chain information, residue information, atom array, and metadata.
Expand All @@ -696,7 +699,7 @@ def _parse_from_cif(filename: os.PathLike | io.StringIO | io.BytesIO, **kwargs)
data_dict = {"extra_info": {}}

# ... read the CIF file into the dictionary (we will clean up the dictionary before returning)
cif_file = read_any(filename)
cif_file = read_any(filename, file_type=file_type)
data_dict["cif_block"] = cif_file.block

# ... load metadata into "metadata" key (either from RCSB standard fields, or from the custom `extra_metadata` field)
Expand Down
Loading
Loading