RosettaCommons · rclune · Dec 9, 2025 · Dec 9, 2025 · Dec 10, 2025 · Dec 10, 2025
diff --git a/CITATION.cff b/CITATION.cff
@@ -0,0 +1,90 @@
+cff-version: 1.2.0
+title: atomworks
+message: >-
+  If you use this software, please cite it using the
+  metadata from this file.
+type: software
+authors:
+  - family-names: Corley
+    given-names: Nathaniel
+  - family-names: Mathis
+    given-names: Simon
+  - family-names: Krishna
+    given-names: Rohith
+  - family-names: Bauer
+    given-names: Magnus S.
+  - family-names: Thompson
+    given-names: Tuscan R.
+  - family-names: Ahern
+    given-names: Woody
+  - family-names: Kazman
+    given-names: Maxwell W.
+  - family-names: Brent
+    given-names: Rafael I.
+  - family-names: Didi
+    given-names: Kieran
+  - family-names: Kubaney
+    given-names: Andrew
+  - family-names: McHugh
+    given-names: Liam
+  - family-names: Nagle
+    given-names: Andrew
+  - family-names: Favor
+    given-names: Adam
+  - family-names: Kshirsagar
+    given-names: Meghana
+  - family-names: Sturmfels
+    given-names: Pascal
+  - family-names: Li
+    given-names: Yinuo
+  - family-names: Butcher
+    given-names: John
+  - family-names: Qiang
+    given-names: Bo
+  - family-names: Schaaf
+    given-names: Luna L.
+  - family-names: Mitra
+    given-names: Ria
+  - family-names: Campbell
+    given-names: Kerrie
+  - family-names: Zhang
+    given-names: Opa
+  - family-names: Weissman
+    given-names: Rose
+  - family-names: Humphreys
+    given-names: Ian R.
+  - family-names: Cong
+    given-names: Qian
+  - family-names: Jiang
+    given-names: Hanlun
+  - family-names: Funk
+    given-names: Jason
+  - family-names: Sonthalia
+    given-names: Satyaki
+  - family-names: Lio
+    given-names: Pietro
+  - family-names: Baker
+    given-names: David
+  - family-names: DiMaio
+    given-names: Frank
+identifiers:
+  - type: doi
+    value: 10.1101/2025.08.14.670328
+    description: bioRxiv preprint
+repository-code: https://github.com/RosettaCommons/atomworks
+url: https://rosettacommons.github.io/atomworks/latest/
+abstract: >-
+  A research-oriented data toolkit for training biomolecular
+  deep-learning foundation models. AtomWorks provides tools
+  for parsing, cleaning, manipulating, and converting
+  biological data (structures, sequences, small molecules)
+  as well as advanced dataset featurization and sampling for
+  deep learning workflows.
+keywords:
+  - bioinformatics
+  - machine-learning
+  - deep-learning
+  - protein-structure
+  - biotite
+  - structural-biology
+license: BSD-3-Clause
diff --git a/README.md b/README.md
@@ -152,7 +152,7 @@ We thank Hope Woods and Rachel Clune from the Rosetta Commons for their partners
 
 If you make use of AtomWorks in your research, please cite:
 
-> N. Corley\*, S. Mathis\*, R. Krishna\*, M. S. Bauer, T. R. Thompson, W. Ahern, M. W. Kazman, R. I. Brent, K. Didi, A. Kubaney, L. McHugh, A. Nagle, A. Favor, M. Kshirsagar, P. Sturmfels, Y. Li, J. Butcher, B. Qiang, L. L. Schaaf, R. Mitra, K. Campbell, O. Zhang, R. Weissman, I. R. Humphreys, Q. Cong, J. Funk, S. Sonthalia, P. Lio, D. Baker, F. DiMaio,
+> N. Corley\*, S. Mathis\*, R. Krishna\*, M. S. Bauer, T. R. Thompson, W. Ahern, M. W. Kazman, R. I. Brent, K. Didi, A. Kubaney, L. McHugh, A. Nagle, A. Favor, M. Kshirsagar, P. Sturmfels, Y. Li, J. Butcher, B. Qiang, L. L. Schaaf, R. Mitra, K. Campbell, O. Zhang, R. Weissman, I. R. Humphreys, Q. Cong, H. Jiang, J. Funk, S. Sonthalia, P. Lio, D. Baker, F. DiMaio,
 > "Accelerating Biomolecular Modeling with AtomWorks and RF3," bioRxiv, August 2025. doi: [10.1101/2025.08.14.670328](https://doi.org/10.1101/2025.08.14.670328)
 
 If you use bibtex, here's the GoogleScholar formatted citation:

diff --git a/docs/conf.py b/docs/conf.py
@@ -11,6 +11,7 @@
 
 sys.path.insert(0, os.path.abspath("../src"))
 
+
 import atomworks
 
 project = "atomworks"
@@ -44,11 +45,17 @@
     "sphinx.ext.viewcode",  # Add source code links
     "sphinx.ext.napoleon",  # Google/NumPy style docstrings
     "sphinx_gallery.gen_gallery",  # Generates auto_examples/ from examples/
+    # "sphinx_click",  # not enabled; the CLI reference uses sphinxcontrib.typer instead
+    "sphinxcontrib.typer"
 ]
 
 templates_path = ["_templates"]
-exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "examples/GALLERY_HEADER.rst", "ml/preprocessing.rst"]
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "examples/GALLERY_HEADER.rst"]  # "ml/preprocessing.rst" is no longer excluded; it is listed in ml.rst
 
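+# autodoc_mock_imports is currently disabled. Sphinx can mock the modules listed
+# there (e.g. "zstandard", "torch") if they cannot be imported at docs build time:
+# autodoc_mock_imports = ["zstandard", "torch"]
+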
 # -- Options for HTML output -------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
 

diff --git a/docs/docs_requirements.txt b/docs/docs_requirements.txt
@@ -5,4 +5,5 @@ sphinx-autodoc-typehints>=1.20.0,<2
 nbsphinx>=0.8.9,<1
 sphinx-gallery>=0.8.1,<1
 ghp-import>=2.0.0,<3
-pandoc>=2.0.0,<3
+pandoc>=2.0.0,<3
+sphinxcontrib-typer>=0.7.2
diff --git a/docs/examples/load_and_visualize_structures.py b/docs/examples/load_and_visualize_structures.py
@@ -82,7 +82,7 @@
 print("Available annotations:")
 annotations = atom_array.get_annotation_categories()
 for i, annotation in enumerate(annotations):
-    print(f"  {i+1:2d}. {annotation}")
+    print(f"  {i + 1:2d}. {annotation}")
 
 
 # %%

diff --git a/docs/index.rst b/docs/index.rst
@@ -23,4 +23,5 @@ Welcome to **atomworks** — a toolkit for converting, parsing, and manipulating
    api_reference
    auto_examples/index
    contributor_guide
-   mirrors
+   mirrors
+   msa
diff --git a/docs/ml.rst b/docs/ml.rst
@@ -11,7 +11,7 @@ Core Modules
    ml/samplers
 
 Data Processing Modules
-----------------------
+-----------------------
 
 .. toctree::
    :maxdepth: 2
@@ -23,4 +23,5 @@ Data Processing Modules
    ml/transforms/dna
    ml/transforms/feature_aggregation
    ml/transforms/msa
-   ml/utils
+   ml/utils
+   ml/preprocessing
diff --git a/docs/ml/preprocessing.rst b/docs/ml/preprocessing.rst
@@ -23,6 +23,45 @@ Utilities
 ---------
 
 .. automodule:: atomworks.ml.preprocessing.utils
+   :members:
+   :undoc-members:
+   :show-inheritance: 
+
+
+MSA
+---
+
+The functions documented below are also exposed as command-line tools; see :doc:`../msa`
+for usage details.
+
+Finding
+^^^^^^^
+
+.. automodule:: atomworks.ml.preprocessing.msa.finding
+   :members:
+   :undoc-members:
+   :show-inheritance: 
+
+Filtering
+^^^^^^^^^
+
+.. automodule:: atomworks.ml.preprocessing.msa.filtering
+   :members:
+   :undoc-members:
+   :show-inheritance: 
+
+Generating
+^^^^^^^^^^
+
+.. automodule:: atomworks.ml.preprocessing.msa.generating
+   :members:
+   :undoc-members:
+   :show-inheritance: 
+
+Organizing
+^^^^^^^^^^
+
+.. automodule:: atomworks.ml.preprocessing.msa.organizing
    :members:
    :undoc-members:
    :show-inheritance: 
diff --git a/docs/msa.rst b/docs/msa.rst
@@ -0,0 +1,31 @@
+Multiple Sequence Alignment in AtomWorks
+========================================
+
+AtomWorks provides command-line tools for finding, filtering, generating, and organizing Multiple Sequence Alignments (MSAs).
+
+--------------
+
+Find
+----
+
+.. typer:: atomworks_cli.find:app
+    :prog: atomworks msa find
+    :show-nested:
+
+Filter
+------
+.. typer:: atomworks_cli.filter:app
+    :prog: atomworks msa filter
+    :show-nested:
+
+Generate
+--------
+.. typer:: atomworks_cli.generate:app
+    :prog: atomworks msa generate
+    :show-nested:
+
+Organize
+--------
+.. typer:: atomworks_cli.organize:app
+    :prog: atomworks msa organize
+    :show-nested:
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "atomworks"
-version = "2.1.1"
+version = "2.2.0"
 description = "A research-oriented data toolkit for training biomolecular deep-learning foundation models"
 readme = "README.md"
 requires-python = ">=3.11"

diff --git a/src/atomworks/biotite_patch.py b/src/atomworks/biotite_patch.py
@@ -184,7 +184,7 @@ def array(atoms: list[Atom]) -> AtomArray:
         for i, atom in enumerate(atoms):
             if sorted(atom._annot.keys()) != names:
                 raise ValueError(
-                    f"The atom at index {i} does not share the same " f"annotation categories as the atom at index 0"
+                    f"The atom at index {i} does not share the same annotation categories as the atom at index 0"
                 )
         array = AtomArray(len(atoms))
 

diff --git a/src/atomworks/io/parser.py b/src/atomworks/io/parser.py
@@ -125,7 +125,7 @@ def _build_cache_file_path(
 def parse(
     filename: os.PathLike | io.StringIO | io.BytesIO,
     *,
-    file_type: Literal["cif", "pdb"] | None = None,
+    file_type: Literal["cif", "pdb", "mmjson"] | None = None,
     ccd_mirror_path: os.PathLike | None = CCD_MIRROR_PATH,
     cache_dir: os.PathLike | None = None,
     save_to_cache: bool = False,
@@ -163,7 +163,7 @@ def parse(
             atomic-level structure (e.g. .cif, .bcif, .cif.gz, .pdb), although .cif files are strongly recommended.
 
         **Wrapper arguments:**
-        file_type (Literal["cif", "pdb"] | None, optional): The file type of the structure file.
+        file_type (Literal["cif", "pdb", "mmjson"] | None, optional): The file type of the structure file.
             If not provided, the file type will be inferred automatically.
         load_from_cache (bool, optional): Whether to load pre-compiled results from cache. Defaults to False.
         cache_dir (PathLike, optional): Directory path to save pre-compiled results. Defaults to None.
@@ -329,9 +329,10 @@ def parse(
             build_assembly=build_assembly,
             extra_fields=extra_fields,
         )
-    elif file_type in ("cif", "bcif"):
+    elif file_type in ("cif", "bcif", "mmjson"):
         result = _parse_from_cif(
             filename=filename,
+            file_type=file_type,
             ccd_mirror_path=ccd_mirror_path,
             add_missing_atoms=add_missing_atoms,
             add_id_and_entity_annotations=add_id_and_entity_annotations,
@@ -684,7 +685,9 @@ def parse_atom_array(
     return data_dict
 
 
-def _parse_from_cif(filename: os.PathLike | io.StringIO | io.BytesIO, **kwargs) -> dict[str, Any]:
+def _parse_from_cif(
+    filename: os.PathLike | io.StringIO | io.BytesIO, file_type: str | None = None, **kwargs
+) -> dict[str, Any]:
     """Parse the CIF file.
 
     Return chain information, residue information, atom array, and metadata.
@@ -696,7 +699,7 @@ def _parse_from_cif(filename: os.PathLike | io.StringIO | io.BytesIO, **kwargs)
     data_dict = {"extra_info": {}}
 
     # ... read the CIF file into the dictionary (we will clean up the dictionary before returning)
-    cif_file = read_any(filename)
+    cif_file = read_any(filename, file_type=file_type)
     data_dict["cif_block"] = cif_file.block
 
     # ... load metadata into "metadata" key (either from RCSB standard fields, or from the custom `extra_metadata` field)