From 5feb4a7a5438e2837d488a0d292fa581e67db29c Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Tue, 19 Aug 2025 14:09:42 +0200 Subject: [PATCH 01/19] Add MarkupText.from_ & add some string methods --- python/acl_anthology/text/markuptext.py | 101 ++++++++++++++++++++++-- python/tests/text/markuptext_test.py | 22 +++++- 2 files changed, 116 insertions(+), 7 deletions(-) diff --git a/python/acl_anthology/text/markuptext.py b/python/acl_anthology/text/markuptext.py index 18f18fbd2d..7eadc6566d 100644 --- a/python/acl_anthology/text/markuptext.py +++ b/python/acl_anthology/text/markuptext.py @@ -19,8 +19,9 @@ from attrs import define, field from collections import defaultdict from copy import deepcopy +from functools import total_ordering from lxml import etree -from typing import Iterator, Optional +from typing import Iterator, Optional, SupportsIndex from xml.sax.saxutils import escape as xml_escape from ..utils import ( @@ -63,10 +64,30 @@ def markup_to_latex(element: etree._Element) -> str: @define(repr=False) +@total_ordering class MarkupText: """Text with optional markup. - This class **should not be instantiated directly,** but only through its class method constructors. This is because the internal representation of the markup text may change at any time. + Warning: + This class **should not be instantiated directly,** but only through its class method constructors. + + Example: + ```python + title = MarkupText.from_("A Structured Review of the Validity of BLEU") + title = MarkupText.from_("TTCS$^{\\mathcal{E}}$: a Vectorial Resource for Computing Conceptual Similarity") + ``` + + Note: + This class implements a limited subset of string methods to make common operations more convenient, for example: + + ```python + title = MarkupText.from_string("A Structured Review of the Validity of BLEU") + title == "A Structured Review of the Validity of BLEU" # True + "BLEU" in title # True + title.startswith("A ") # True + ``` + + In all cases, if a string is involved, these operate on the [plain text representation][acl_anthology.text.markuptext.MarkupText.as_text] of the class, which disregards all markup. If comparing against another MarkupText object, the markup will be taken into account as well. """ # IMPLEMENTATION NOTE: Deepcopy-ing (or newly instantiating) etree._Element @@ -79,10 +100,29 @@ class MarkupText: _content: etree._Element | str = field() # For caching - _html: Optional[str] = field(init=False, default=None) - _latex: Optional[str] = field(init=False, default=None) - _text: Optional[str] = field(init=False, default=None) - _xml: Optional[str] = field(init=False, default=None) + _html: Optional[str] = field(init=False, default=None, eq=False) + _latex: Optional[str] = field(init=False, default=None, eq=False) + _text: Optional[str] = field(init=False, default=None, eq=False) + _xml: Optional[str] = field(init=False, default=None, eq=False) + + def __contains__(self, key: object) -> bool: + if isinstance(key, str): + return key in self.as_text() + return False + + def __eq__(self, other: object) -> bool: + if isinstance(other, str): + return self.as_text() == other + elif isinstance(other, MarkupText): + return self.as_xml() == other.as_xml() + return False + + def __lt__(self, other: object) -> bool: + if isinstance(other, str): + return self.as_text() < other + elif isinstance(other, MarkupText): + return self.as_xml() < other.as_xml() + return False def __str__(self) -> str: return self.as_text() @@ -231,6 +271,29 @@ def from_xml(cls, element: etree._Element) -> MarkupText: else: return cls(str(element.text) if element.text is not None else "") + @classmethod + def from_(cls, content: etree._Element | str) -> MarkupText: + """Instantiate MarkupText from an XML element or a string, heuristically parsing any supported markup. + + - If called with an XML element, assumes it uses the Anthology's markup format and calls [`from_xml()`][acl_anthology.text.markuptext.MarkupText.from_xml]. + - If called with a string, assumes the string might contain markup and will try to intelligently parse it. At the moment, only LaTeX markup is supported, which means that the effect of this is identical to calling [`from_latex_maybe()`][acl_anthology.text.markuptext.MarkupText.from_latex_maybe], but this may change if we support different types of markup in the future. + + Note: + If you want more fine-grained control over markup detection, call one of the more specific builder functions instead. + + Arguments: + content: A string potentially containing markup, or an XML element containing valid MarkupText according to the schema. + + Returns: + Instantiated MarkupText object corresponding to the content. + """ + if isinstance(content, etree._Element): + return cls.from_xml(content) + elif isinstance(content, str): + return cls.from_latex_maybe(content) + else: # pragma: no cover + raise TypeError(f"Cannot instantiate MarkupText from {type(content)}") + def to_xml(self, tag: str = "span") -> etree._Element: """ Arguments: @@ -246,3 +309,29 @@ def to_xml(self, tag: str = "span") -> etree._Element: element = deepcopy(self._content) element.tag = tag return element + + ### STRING METHODS + + def endswith( + self, + suffix: str | tuple[str, ...], + start: Optional[SupportsIndex] = None, + end: Optional[SupportsIndex] = None, + ) -> bool: + """Return True if the string ends with the specified suffix, False otherwise. + + Equivalent to `self.as_text().endswith(...)`. + """ + return self.as_text().endswith(suffix, start, end) + + def startswith( + self, + prefix: str | tuple[str, ...], + start: Optional[SupportsIndex] = None, + end: Optional[SupportsIndex] = None, + ) -> bool: + """Return True if the string starts with the specified prefix, False otherwise. + + Equivalent to `self.as_text().startswith(...)`. + """ + return self.as_text().startswith(prefix, start, end) diff --git a/python/tests/text/markuptext_test.py b/python/tests/text/markuptext_test.py index 48a72f48a9..fde88c1894 100644 --- a/python/tests/text/markuptext_test.py +++ b/python/tests/text/markuptext_test.py @@ -205,11 +205,12 @@ def test_markup_from_xml(inp, out): assert markup.as_html() == out["html"] assert markup.as_latex() == out["latex"] assert markup.as_xml() == inp + assert MarkupText.from_(element) == markup assert etree.tostring(markup.to_xml("title"), encoding="unicode") == xml assert markup.contains_markup == ("<" in out["html"]) -def test_simple_string(): +def test_markup_from_simple_string(): text = "Some ASCII text without markup" markup = MarkupText.from_string(text) assert not markup.contains_markup @@ -217,10 +218,12 @@ def test_simple_string(): assert markup.as_html() == text assert markup.as_latex() == text assert markup.as_xml() == text + assert markup == text assert ( etree.tostring(markup.to_xml("span"), encoding="unicode") == f"{text}" ) + assert MarkupText.from_(text) == markup test_cases_markup_from_latex = ( @@ -424,3 +427,20 @@ def test_markup_from_latex_maybe(inp, out1, out2): assert markup.as_xml() == out1 markup = MarkupText.from_latex_maybe(inp) assert markup.as_xml() == out2 + + +def test_markup_behaves_like_string(): + markup = MarkupText.from_latex( + "TTCS$^{\mathcal{E}}$: a Vectorial Resource for Computing Conceptual Similarity" + ) + plain_text = "TTCSℰ: a Vectorial Resource for Computing Conceptual Similarity" + assert markup == plain_text # disregards markup + assert markup != MarkupText.from_string(plain_text) # does not disregard markup + assert markup == MarkupText.from_latex( + "TTCS$^{\mathcal{E}}$: a Vectorial Resource for Computing Conceptual Similarity" + ) + assert "Vectorial" in markup + assert markup.startswith("TTCS") + assert markup.endswith("Similarity") + assert markup < "XTCS" + assert markup < MarkupText.from_string("XTCS") From 6249922d01876519cb0edb94f78aabd2da542538 Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Tue, 19 Aug 2025 15:31:59 +0200 Subject: [PATCH 02/19] Use automatic converters for MarkupText attributes, change string method logic --- .../acl_anthology/collections/collection.py | 4 +- python/acl_anthology/collections/event.py | 10 +++-- python/acl_anthology/collections/paper.py | 10 +++-- python/acl_anthology/collections/volume.py | 22 +++++++--- python/acl_anthology/text/__init__.py | 4 +- python/acl_anthology/text/markuptext.py | 40 ++++++++++------- python/tests/collections/collection_test.py | 10 ++++- python/tests/collections/event_test.py | 2 +- python/tests/collections/paper_test.py | 7 ++- python/tests/collections/volume_test.py | 44 ++++++++++--------- python/tests/text/markuptext_test.py | 16 +++++-- 11 files changed, 104 insertions(+), 65 deletions(-) diff --git a/python/acl_anthology/collections/collection.py b/python/acl_anthology/collections/collection.py index 29615e78f7..5c81ec64b5 100644 --- a/python/acl_anthology/collections/collection.py +++ b/python/acl_anthology/collections/collection.py @@ -148,7 +148,7 @@ def validate_schema(self) -> Self: def create_volume( self, id: str, - title: MarkupText, + title: MarkupText | str, year: Optional[str] = None, type: VolumeType = VolumeType.PROCEEDINGS, **kwargs: Any, @@ -157,7 +157,7 @@ def create_volume( Parameters: id: The ID of the new volume. - title: The title of the new volume. + title: The title of the new volume. If given as a string, it will be [heuristically parsed for markup][acl_anthology.text.markuptext.MarkupText.from_]. year: The year of the new volume (optional); if None, will infer the year from this collection's ID. type: Whether this is a journal or proceedings volume; defaults to [VolumeType.PROCEEDINGS][acl_anthology.collections.types.VolumeType]. **kwargs: Any valid list or optional attribute of [Volume][acl_anthology.collections.volume.Volume]. diff --git a/python/acl_anthology/collections/event.py b/python/acl_anthology/collections/event.py index e05a3208f3..3557de023c 100644 --- a/python/acl_anthology/collections/event.py +++ b/python/acl_anthology/collections/event.py @@ -15,7 +15,7 @@ from __future__ import annotations -from attrs import define, field, validators as v +from attrs import define, field, converters, validators as v from lxml import etree from lxml.builder import E from typing import Any, Iterator, Optional, TYPE_CHECKING @@ -24,7 +24,7 @@ from ..constants import RE_EVENT_ID from ..files import EventFileReference from ..people import NameSpecification -from ..text import MarkupText +from ..text import MarkupText, to_markuptext from ..utils.attrs import auto_validate_types from ..utils.ids import AnthologyID, AnthologyIDTuple, parse_id, build_id_from_tuple @@ -44,7 +44,7 @@ class Talk: attachments: Links to attachments for this talk. The dictionary key specifies the type of attachment (e.g., "video" or "slides"). """ - title: MarkupText = field() + title: MarkupText = field(converter=to_markuptext) type: Optional[str] = field(default=None) speakers: list[NameSpecification] = field(factory=list) attachments: dict[str, EventFileReference] = field(factory=dict) @@ -128,7 +128,9 @@ class Event: ), ) - title: Optional[MarkupText] = field(default=None) + title: Optional[MarkupText] = field( + default=None, converter=converters.optional(to_markuptext) + ) location: Optional[str] = field(default=None) dates: Optional[str] = field(default=None) diff --git a/python/acl_anthology/collections/paper.py b/python/acl_anthology/collections/paper.py index b95e04896a..75212925ba 100644 --- a/python/acl_anthology/collections/paper.py +++ b/python/acl_anthology/collections/paper.py @@ -15,7 +15,7 @@ from __future__ import annotations import attrs -from attrs import define, field, validators as v +from attrs import define, field, converters, validators as v import datetime from functools import cached_property import langcodes @@ -34,7 +34,7 @@ VideoReference, ) from ..people import NameSpecification -from ..text import MarkupText +from ..text import MarkupText, to_markuptext from ..utils.attrs import auto_validate_types, date_to_str, int_to_str from ..utils.citation import citeproc_render_html, render_acl_citation from ..utils.ids import build_id, is_valid_item_id, AnthologyIDTuple @@ -257,7 +257,7 @@ class Paper: bibkey: str = field( on_setattr=attrs.setters.pipe(attrs.setters.validate, _update_bibkey_index), ) - title: MarkupText = field() + title: MarkupText = field(converter=to_markuptext) attachments: list[tuple[str, AttachmentReference]] = field( factory=list, @@ -289,7 +289,9 @@ class Paper: ) videos: list[VideoReference] = field(factory=list, repr=False) - abstract: Optional[MarkupText] = field(default=None) + abstract: Optional[MarkupText] = field( + default=None, converter=converters.optional(to_markuptext) + ) deletion: Optional[PaperDeletionNotice] = field( default=None, repr=False, validator=v.optional(v.instance_of(PaperDeletionNotice)) ) diff --git a/python/acl_anthology/collections/volume.py b/python/acl_anthology/collections/volume.py index 07d3857d5b..f50b573e30 100644 --- a/python/acl_anthology/collections/volume.py +++ b/python/acl_anthology/collections/volume.py @@ -15,7 +15,7 @@ from __future__ import annotations import datetime -from attrs import define, field, validators +from attrs import define, field, converters, validators from lxml import etree from lxml.builder import E from typing import Any, Iterator, Optional, cast, TYPE_CHECKING @@ -27,7 +27,7 @@ from ..exceptions import AnthologyDuplicateIDError, AnthologyInvalidIDError from ..files import PDFReference from ..people import NameSpecification -from ..text import MarkupText +from ..text import MarkupText, to_markuptext from ..venues import Venue from ..utils.attrs import auto_validate_types, date_to_str, int_to_str from ..utils.ids import build_id, is_valid_item_id, AnthologyIDTuple @@ -77,7 +77,7 @@ class Volume(SlottedDict[Paper]): id: str = field(converter=int_to_str) # validator defined below parent: Collection = field(repr=False, eq=False) type: VolumeType = field(repr=False, converter=VolumeType) - title: MarkupText = field(alias="booktitle") + title: MarkupText = field(alias="booktitle", converter=to_markuptext) year: str = field( converter=int_to_str, validator=validators.matches_re(r"^[0-9]{4}$") ) @@ -101,7 +101,10 @@ class Volume(SlottedDict[Paper]): pdf: Optional[PDFReference] = field(default=None, repr=False) publisher: Optional[str] = field(default=None, repr=False) shorttitle: Optional[MarkupText] = field( - default=None, alias="shortbooktitle", repr=False + default=None, + alias="shortbooktitle", + repr=False, + converter=converters.optional(to_markuptext), ) @id.validator @@ -244,7 +247,7 @@ def generate_paper_id(self) -> str: def create_paper( self, - title: MarkupText, + title: MarkupText | str, id: Optional[str] = None, bibkey: str = constants.NO_BIBKEY, **kwargs: Any, @@ -252,7 +255,7 @@ def create_paper( """Create a new [Paper][acl_anthology.collections.paper.Paper] object in this volume. Parameters: - title: The title of the new paper. + title: The title of the new paper. If given as a string, it will be [heuristically parsed for markup][acl_anthology.text.markuptext.MarkupText.from_]. id: The ID of the new paper (optional); if None, will generate the next-highest numeric ID that doesn't already exist in this volume. bibkey: The citation key of the new paper (optional); defaults to [`constants.NO_BIBKEY`][acl_anthology.constants.NO_BIBKEY], in which case a non-clashing citation key will be automatically generated (recommended!). **kwargs: Any valid list or optional attribute of [Paper][acl_anthology.collections.paper.Paper]. @@ -271,7 +274,12 @@ def create_paper( ) kwargs["parent"] = self - paper = Paper(id=id, bibkey=bibkey, title=title, **kwargs) + paper = Paper( + id=id, + bibkey=bibkey, + title=title, + **kwargs, + ) # Necessary because on_setattr is not called during initialization: paper.bibkey = bibkey # triggers bibkey generating (if necessary) & indexing diff --git a/python/acl_anthology/text/__init__.py b/python/acl_anthology/text/__init__.py index 5da2f2937d..f05432c68d 100644 --- a/python/acl_anthology/text/__init__.py +++ b/python/acl_anthology/text/__init__.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .markuptext import MarkupText +from .markuptext import MarkupText, to_markuptext from .stopwords import StopWords from .texmath import TexMath -__all__ = ["MarkupText", "StopWords", "TexMath"] +__all__ = ["MarkupText", "StopWords", "TexMath", "to_markuptext"] diff --git a/python/acl_anthology/text/markuptext.py b/python/acl_anthology/text/markuptext.py index 7eadc6566d..a235f196e1 100644 --- a/python/acl_anthology/text/markuptext.py +++ b/python/acl_anthology/text/markuptext.py @@ -16,7 +16,7 @@ from __future__ import annotations -from attrs import define, field +from attrs import define, field, validators as v from collections import defaultdict from copy import deepcopy from functools import total_ordering @@ -69,7 +69,7 @@ class MarkupText: """Text with optional markup. Warning: - This class **should not be instantiated directly,** but only through its class method constructors. + This class **should not be instantiated directly.** Use its class method constructors instead. Example: ```python @@ -87,7 +87,7 @@ class MarkupText: title.startswith("A ") # True ``` - In all cases, if a string is involved, these operate on the [plain text representation][acl_anthology.text.markuptext.MarkupText.as_text] of the class, which disregards all markup. If comparing against another MarkupText object, the markup will be taken into account as well. + These operate on the [stringified XML representation][acl_anthology.text.markuptext.MarkupText.as_xml] of the class. """ # IMPLEMENTATION NOTE: Deepcopy-ing (or newly instantiating) etree._Element @@ -97,29 +97,29 @@ class MarkupText: # check everywhere whether we're dealing with etree._Element or str), but # much faster. ---For further optimization, we could explore if there's an # alternative that doesn't require deepcopy-ing XML elements at all. - _content: etree._Element | str = field() + _content: etree._Element | str = field(validator=v.instance_of((etree._Element, str))) # For caching - _html: Optional[str] = field(init=False, default=None, eq=False) - _latex: Optional[str] = field(init=False, default=None, eq=False) - _text: Optional[str] = field(init=False, default=None, eq=False) - _xml: Optional[str] = field(init=False, default=None, eq=False) + _html: Optional[str] = field(init=False, default=None) + _latex: Optional[str] = field(init=False, default=None) + _text: Optional[str] = field(init=False, default=None) + _xml: Optional[str] = field(init=False, default=None) def __contains__(self, key: object) -> bool: if isinstance(key, str): - return key in self.as_text() + return key in self.as_xml() return False def __eq__(self, other: object) -> bool: if isinstance(other, str): - return self.as_text() == other + return self.as_xml() == other elif isinstance(other, MarkupText): return self.as_xml() == other.as_xml() return False def __lt__(self, other: object) -> bool: if isinstance(other, str): - return self.as_text() < other + return self.as_xml() < other elif isinstance(other, MarkupText): return self.as_xml() < other.as_xml() return False @@ -268,8 +268,7 @@ def from_xml(cls, element: etree._Element) -> MarkupText: """ if len(element): return cls(deepcopy(element)) - else: - return cls(str(element.text) if element.text is not None else "") + return cls(str(element.text) if element.text is not None else "") @classmethod def from_(cls, content: etree._Element | str) -> MarkupText: @@ -320,9 +319,9 @@ def endswith( ) -> bool: """Return True if the string ends with the specified suffix, False otherwise. - Equivalent to `self.as_text().endswith(...)`. + Equivalent to `self.as_xml().endswith(...)`. """ - return self.as_text().endswith(suffix, start, end) + return self.as_xml().endswith(suffix, start, end) def startswith( self, @@ -332,6 +331,13 @@ def startswith( ) -> bool: """Return True if the string starts with the specified prefix, False otherwise. - Equivalent to `self.as_text().startswith(...)`. + Equivalent to `self.as_xml().startswith(...)`. """ - return self.as_text().startswith(prefix, start, end) + return self.as_xml().startswith(prefix, start, end) + + +def to_markuptext(value: object) -> MarkupText: + if isinstance(value, MarkupText): + return value + # MarkupText.from_ already raises TypeError, so let's not duplicate that here + return MarkupText.from_(value) # type: ignore[arg-type] diff --git a/python/tests/collections/collection_test.py b/python/tests/collections/collection_test.py index 895cd4042d..205b39a88c 100644 --- a/python/tests/collections/collection_test.py +++ b/python/tests/collections/collection_test.py @@ -143,11 +143,12 @@ def test_collection_create_volume_implicit(collection_index): collection = collection_index.get("2022.acl") volume = collection.create_volume( "keynotes", - title=MarkupText.from_string("Keynotes from ACL 2022"), + title="Keynotes from ACL 2022", ) assert volume.id in collection assert volume.year == "2022" assert volume.id == "keynotes" + assert volume.title == "Keynotes from ACL 2022" assert volume.full_id == "2022.acl-keynotes" assert volume.type == VolumeType.PROCEEDINGS @@ -165,12 +166,19 @@ def test_collection_create_volume_explicit(collection_index): assert volume.id in collection assert volume.year == "1989" assert volume.id == "99" + assert volume.title == "Special Issue" assert volume.full_id == "1989.cl-99" assert volume.type == VolumeType.JOURNAL assert volume.journal_issue == "99" assert "cl" in volume.venue_ids +def test_collection_create_volume_should_parse_markup(collection_index): + collection = collection_index.get("2022.acl") + volume = collection.create_volume("infinity", title="Special issue on $\\infty$") + assert volume.title.as_text() == "Special issue on ∞" + + def test_collection_create_volume_should_fail_in_oldstyle_volumes(collection_index): collection = collection_index.get("L06") with pytest.raises(ValueError): diff --git a/python/tests/collections/event_test.py b/python/tests/collections/event_test.py index 47bf1aeb18..58a8a81c5d 100644 --- a/python/tests/collections/event_test.py +++ b/python/tests/collections/event_test.py @@ -95,7 +95,7 @@ def test_event_all_attribs(): (("2023.baz", "1", None), EventLinkingType.EXPLICIT), (("2023.asdf", "1", None), EventLinkingType.EXPLICIT), ], - talks=[Talk(MarkupText.from_string("Invited talk"))], + talks=[Talk("Invited talk")], links={"Website": EventFileReference("http://foobar.com")}, ) assert event.collection_id == "2023.li" diff --git a/python/tests/collections/paper_test.py b/python/tests/collections/paper_test.py index 8e4ff3502d..99f1d55a82 100644 --- a/python/tests/collections/paper_test.py +++ b/python/tests/collections/paper_test.py @@ -45,11 +45,10 @@ def index(anthology_stub): def test_paper_minimum_attribs(): - paper_title = MarkupText.from_string("A minimal example") parent = None - paper = Paper("42", parent, bibkey="nn-1900-minimal", title=paper_title) + paper = Paper("42", parent, bibkey="nn-1900-minimal", title="A minimal example") assert not paper.is_deleted - assert paper.title == paper_title + assert paper.title == "A minimal example" def test_paper_web_url(anthology): @@ -156,7 +155,7 @@ def test_paper_language(anthology, paper_id, language, language_name): def test_paper_bibtype(): volume = VolumeStub() volume.type = VolumeType.JOURNAL - paper = Paper("1", volume, bibkey="", title=MarkupText.from_string("")) + paper = Paper("1", volume, bibkey="", title="") assert paper.bibtype == "article" volume.type = VolumeType.PROCEEDINGS assert paper.bibtype == "inproceedings" diff --git a/python/tests/collections/volume_test.py b/python/tests/collections/volume_test.py index f57d7d65c0..d9aff7cd22 100644 --- a/python/tests/collections/volume_test.py +++ b/python/tests/collections/volume_test.py @@ -118,31 +118,28 @@ def __init__(self, parent): def test_volume_minimum_attribs(): - volume_title = MarkupText.from_string("Lorem ipsum") parent = Collection("L05", None, Path(".")) volume = Volume( "6", parent, type=VolumeType.JOURNAL, - booktitle=volume_title, + booktitle="Lorem ipsum", venue_ids=["li"], year="2005", ) assert volume.full_id == "L05-6" - assert volume.title == volume_title + assert volume.title == "Lorem ipsum" assert volume.get_ingest_date().year == 1900 assert not volume.is_workshop def test_volume_all_attribs(): - volume_title = MarkupText.from_string("Lorem ipsum") - volume_shorttitle = MarkupText.from_string("L.I.") parent = Collection("2023.acl", None, Path(".")) volume = Volume( id="long", parent=parent, type="proceedings", - booktitle=volume_title, + booktitle="Lorem ipsum", year="2023", address="Online", doi="10.100/0000", @@ -152,7 +149,7 @@ def test_volume_all_attribs(): month="jan", pdf=None, publisher="Myself", - shortbooktitle=volume_shorttitle, + shortbooktitle="L.I.", venue_ids=["li", "acl"], ) assert volume.ingest_date == "2023-01-12" @@ -256,13 +253,12 @@ def test_volume_venues_naloma(anthology): def test_volume_with_nonexistent_venue(anthology): - volume_title = MarkupText.from_string("Lorem ipsum") parent = Collection("L05", CollectionIndexStub(anthology), Path(".")) volume = Volume( "42", parent, type=VolumeType.JOURNAL, - booktitle=volume_title, + booktitle="Lorem ipsum", venue_ids=["doesntexist"], year="2005", ) @@ -339,7 +335,7 @@ def test_volume_generate_paper_id(anthology): volume.create_paper( id="604", bibkey="my-awesome-paper", - title=MarkupText.from_string("The awesome paper I have never written"), + title="The awesome paper I have never written", ) assert volume.generate_paper_id() == "605" @@ -348,7 +344,7 @@ def test_volume_create_paper_implicit(anthology): volume = anthology.get_volume("2022.acl-long") authors = [NameSpec("Bollmann, Marcel")] paper = volume.create_paper( - title=MarkupText.from_string("The awesome paper I have never written"), + title="The awesome paper I have never written", authors=authors, ingest_date="2025-01-07", ) @@ -368,7 +364,7 @@ def test_volume_create_paper_explicit(anthology): volume = anthology.get_volume("2022.acl-long") authors = [NameSpec("Bollmann, Marcel")] paper = volume.create_paper( - title=MarkupText.from_string("The awesome paper I have never written"), + title="The awesome paper I have never written", authors=authors, ingest_date="2025-01-07", id="701", @@ -389,22 +385,30 @@ def test_volume_create_paper_with_duplicate_id_should_fail(anthology): authors = [NameSpec("Bollmann, Marcel")] with pytest.raises(ValueError): _ = volume.create_paper( - title=MarkupText.from_string("The awesome paper I have never written"), + title="The awesome paper I have never written", authors=authors, - ingest_date="2025-01-07", id="42", ) +def test_volume_create_paper_should_parse_markup(anthology): + volume = anthology.get_volume("2022.acl-long") + authors = [NameSpec("Bollmann, Marcel")] + paper = volume.create_paper( + title="Towards $\\infty$", + authors=authors, + ) + assert paper.title.as_text() == "Towards ∞" + + def test_volume_create_paper_with_editors(anthology): volume = anthology.get_volume("2022.acl-long") # For most papers, the editors are the volume's editors authors = [NameSpec("Bollmann, Marcel")] paper = volume.create_paper( - title=MarkupText.from_string("The awesome paper I have never written"), + title="The awesome paper I have never written", authors=authors, - ingest_date="2025-01-07", ) assert not paper.editors assert paper.get_editors() == volume.editors @@ -412,7 +416,7 @@ def test_volume_create_paper_with_editors(anthology): # But the schema allows paper-level editors too editors = [NameSpec("Calzolari, Nicoletta")] paper = volume.create_paper( - title=MarkupText.from_string("The awesome paper I have never written"), + title="The awesome paper I have never written", authors=authors, editors=editors, ingest_date="2025-01-07", @@ -429,7 +433,7 @@ def test_volume_create_paper_should_update_person(anthology, pre_load): volume = anthology.get_volume("2022.acl-long") authors = [NameSpec("Berg-Kirkpatrick, Taylor")] paper = volume.create_paper( - title=MarkupText.from_string("The awesome paper I have never written"), + title="The awesome paper I have never written", authors=authors, ingest_date="2025-01-07", ) @@ -448,7 +452,7 @@ def test_volume_create_paper_should_update_personindex(anthology, pre_load): volume = anthology.get_volume("2022.acl-long") authors = [NameSpec("Nonexistant, Guy Absolutely")] paper = volume.create_paper( - title=MarkupText.from_string("An entirely imaginary paper"), + title="An entirely imaginary paper", authors=authors, ingest_date="2025-01-07", ) @@ -496,7 +500,7 @@ def test_volume_type_conversion(): 6, parent, type="journal", - booktitle=MarkupText.from_string("Lorem ipsum"), # "Lorem ipsum", + booktitle="Lorem ipsum", year=2005, ) assert volume.id == "6" # str diff --git a/python/tests/text/markuptext_test.py b/python/tests/text/markuptext_test.py index fde88c1894..e1d8558140 100644 --- a/python/tests/text/markuptext_test.py +++ b/python/tests/text/markuptext_test.py @@ -433,9 +433,10 @@ def test_markup_behaves_like_string(): markup = MarkupText.from_latex( "TTCS$^{\mathcal{E}}$: a Vectorial Resource for Computing Conceptual Similarity" ) - plain_text = "TTCSℰ: a Vectorial Resource for Computing Conceptual Similarity" - assert markup == plain_text # disregards markup - assert markup != MarkupText.from_string(plain_text) # does not disregard markup + assert ( + markup + == "TTCS^{\mathcal{E}}: a Vectorial Resource for Computing Conceptual Similarity" + ) assert markup == MarkupText.from_latex( "TTCS$^{\mathcal{E}}$: a Vectorial Resource for Computing Conceptual Similarity" ) @@ -444,3 +445,12 @@ def test_markup_behaves_like_string(): assert markup.endswith("Similarity") assert markup < "XTCS" assert markup < MarkupText.from_string("XTCS") + + +def test_markup_cannot_be_instantiated_from_unsupported_types(): + with pytest.raises(TypeError): + MarkupText(42) + with pytest.raises(TypeError): + MarkupText(["foo", "bar"]) + with pytest.raises(TypeError): + MarkupText(MarkupText("foo")) From 66389b214ca989d26dd733293e81f56049a58ddb Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Tue, 19 Aug 2025 18:13:45 +0200 Subject: [PATCH 03/19] Update documentation for MarkupText --- python/acl_anthology/collections/paper.py | 7 +++- python/docs/guide/modifying-data.md | 49 +++++++++++++++++++---- python/docs/guide/types-of-metadata.md | 13 +++++- 3 files changed, 59 insertions(+), 10 deletions(-) diff --git a/python/acl_anthology/collections/paper.py b/python/acl_anthology/collections/paper.py index 75212925ba..7945549489 100644 --- a/python/acl_anthology/collections/paper.py +++ b/python/acl_anthology/collections/paper.py @@ -296,7 +296,12 @@ class Paper: default=None, repr=False, validator=v.optional(v.instance_of(PaperDeletionNotice)) ) doi: Optional[str] = field(default=None, repr=False) - ingest_date: Optional[str] = field(default=None, repr=False) + ingest_date: Optional[str] = field( + default=None, + repr=False, + converter=date_to_str, + validator=v.optional(v.matches_re(constants.RE_ISO_DATE)), + ) issue: Optional[str] = field(default=None, repr=False) journal: Optional[str] = field(default=None, repr=False) language: Optional[str] = field(default=None, repr=False) diff --git a/python/docs/guide/modifying-data.md b/python/docs/guide/modifying-data.md index ab1c21db7d..7de425b599 100644 --- a/python/docs/guide/modifying-data.md +++ b/python/docs/guide/modifying-data.md @@ -43,22 +43,33 @@ just fetch the paper and set its `doi` attribute: !!! tip "Rule of thumb" - For all `collections` objects, setting attributes should either raise a `TypeError`, or "do the right thing." However, be careful when modifying _list_ attributes in-place, as no input validation is performed in that case. + As a general rule, all classes perform **automatic input conversion and validation**. This means that setting attributes should either "do the right thing" or raise a `TypeError`. The main exception currently is modifying attributes in-place, e.g. appending to lists, as no input validation is performed then. ### Simple attributes Attributes generally perform **input validation**. For example, since a paper's -title needs to be a [`MarkupText`][acl_anthology.text.markuptext.MarkupText], -the following won't work and will raise a `TypeError`: +PDF attribute needs to be an instance of +[`PDFReference`][acl_anthology.files.PDFReference], trying to set it to a path +won't work and will raise a `TypeError`: ```pycon ->>> paper.title = "Computational Historical Linguistics and Language Diversity" +>>> paper.pdf = Path("2025.test-1.pdf") # TypeError ``` However, some attributes also provide **input converters** that perform simpler -conversions automatically. For example, a -[`Volume`][acl_anthology.collections.volume.Volume]'s year of publication or -date of ingestion is stored as a string, but the following will also work: +conversions automatically. For example, paper titles and abstracts are stored +as [`MarkupText`][acl_anthology.text.markuptext.MarkupText] objects, but setting +such an attribute to a string will automatically convert it: + +```pycon +>>> paper.title = "Improving the ACL Anthology" +>>> paper.title +MarkupText('Improving the ACL Anthology') +``` + +The same applies to a [`Volume`][acl_anthology.collections.volume.Volume]'s year +of publication or date of ingestion, which both are stored as strings, but the +following will also work: ```pycon >>> volume = anthology.get("2022.acl-long") @@ -226,7 +237,29 @@ bibkeys automatically. ### Specifying titles and abstracts -Paper titles and abstracts always need to be **supplied as [MarkupText][acl_anthology.text.markuptext.MarkupText]**. Simple strings can be instantiated with [`MarkupText.from_string()`][acl_anthology.text.markuptext.MarkupText.from_string]. For titles and abstracts containing LaTeX commands, [`MarkupText.from_latex()`][acl_anthology.text.markuptext.MarkupText.from_latex] can be used. In practice, however, it may be unknown if text actually contains LaTeX markup. In that case, using [`MarkupText.from_latex_maybe()`][acl_anthology.text.markuptext.MarkupText.from_latex_maybe] may be preferable, which will e.g. prevent percentage signs `%` from being interpreted as starting a LaTeX comment, and apply a heuristic to decide if a tilde `~` should be interpreted as a literal character or as a LaTeX non-breaking space. {==TODO: We might want to make `as_latex_maybe()` the default instantiator for MarkupText, which would greatly simplify the instantiation of this in what is probably the most common use case.==} +Paper titles and abstracts are stored internally as [MarkupText][acl_anthology.text.markuptext.MarkupText], but it is possible to simply set them to a string value, in which case **heuristic markup conversion will be performed**. Generally, for setting attributes that expect MarkupText, the following applies: + +- Supplying a string value `s` is equivalent to using [`MarkupText.from_(s)`][acl_anthology.text.markuptext.MarkupText.from_], which will parse and convert simple markup. Currently, only LaTeX markup is supported. + +- If the heuristic conversion is not desired (or you want to make more explicit that you're converting from LaTeX), other builder methods of MarkupText can be used, such as [`MarkupText.from_latex()`][acl_anthology.text.markuptext.MarkupText.from_latex], or [`MarkupText.from_string()`][acl_anthology.text.markuptext.MarkupText.from_string] for plain strings + +!!! example + + Setting a paper's title to a string automatically parses LaTeX markup contained in the string: + + ```pycon + >>> paper.title = "Towards $\\infty$" + >>> paper.title + MarkupText('Towards \\infty') + ``` + + If this is not desired, MarkupText can be explicitly instantiated with one of its builder methods, for example: + + ```pycon + >>> paper.title = MarkupText.from_string("Towards $\\infty$") + >>> paper.title # No markup parsing here + MarkupText('Towards $\\infty$') + ``` Paper titles should also have our **fixed-casing algorithm** applied to protect certain characters e.g. by wrapping them in braces in BibTeX entries. **The fixed-caser is currently not part of this Python library.** There are two options for running the fixed-casing on a new ingestion: diff --git a/python/docs/guide/types-of-metadata.md b/python/docs/guide/types-of-metadata.md index 5a3f6f1346..4c02f122ed 100644 --- a/python/docs/guide/types-of-metadata.md +++ b/python/docs/guide/types-of-metadata.md @@ -26,6 +26,9 @@ paper.attachments # is {} paper.language # is None ``` +When modifying metadata fields, most of them will [perform input validation +and/or conversion](modifying-data.md#modifying-publications). + ## Markup text Two of the most common field types, **titles** and **abstracts**, can contain @@ -43,7 +46,15 @@ This means that such metadata fields don't contain strings, but rather MarkupText('S4-Tuning: A Simple Cross-lingual Sub-network Tuning Method') ``` -To work with these as Unicode strings, you need to explicitly convert them: +For convenience, they support a limited subset of string methods, for example: + +```pycon +>>> paper.title.startswith("S") # True +>>> "Tuning" in paper.title # True +``` + +If MarkupText is converted to a string, any containing markup will be discarded: + ```pycon >>> str(paper.title) 'S4-Tuning: A Simple Cross-lingual Sub-network Tuning Method' From 89db51df6d6b70ce93f6f00bb67bb1cf17d86e51 Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Tue, 19 Aug 2025 23:34:48 +0200 Subject: [PATCH 04/19] Add Anthology.save_all (partially functional) --- python/acl_anthology/anthology.py | 14 +++++++++ .../acl_anthology/collections/collection.py | 2 ++ python/acl_anthology/sigs.py | 5 ++++ python/tests/anthology_test.py | 29 ++++++++++++++++++- 4 files changed, 49 insertions(+), 1 deletion(-) diff --git a/python/acl_anthology/anthology.py b/python/acl_anthology/anthology.py index 2f3135e42c..2d93c1dfd7 100644 --- a/python/acl_anthology/anthology.py +++ b/python/acl_anthology/anthology.py @@ -180,6 +180,20 @@ def load_all(self) -> Self: ) return self + def save_all(self) -> Self: + """Save all Anthology data files.""" + for collection in self.collections.values(): + if collection.is_modified: + collection.save() + self.people.save() + self.venues.save() + warnings.warn( + UserWarning( + "SIG metadata is not yet automatically saved. Call `.sigs.save()` manually if you need this." + ) + ) + return self + def reset_indices(self) -> Self: """Reset all non-collection indices. diff --git a/python/acl_anthology/collections/collection.py b/python/acl_anthology/collections/collection.py index 5c81ec64b5..2ae4243593 100644 --- a/python/acl_anthology/collections/collection.py +++ b/python/acl_anthology/collections/collection.py @@ -63,6 +63,7 @@ class Collection(SlottedDict[Volume]): Attributes: Non-Init Attributes: event: An event represented by this collection. is_data_loaded: A flag indicating whether the XML file has already been loaded. + is_modified: A flag indicating whether any of the data in this collection has been modified after loading. """ id: str = field(converter=int_to_str) @@ -75,6 +76,7 @@ class Collection(SlottedDict[Volume]): validator=v.optional(v.instance_of(Event)), ) is_data_loaded: bool = field(init=False, repr=True, default=False) + is_modified: bool = field(init=False, repr=False, default=False) @id.validator def _check_id(self, _: Any, value: str) -> None: diff --git a/python/acl_anthology/sigs.py b/python/acl_anthology/sigs.py index eacbe3ad5b..b4f0bbace4 100644 --- a/python/acl_anthology/sigs.py +++ b/python/acl_anthology/sigs.py @@ -209,6 +209,11 @@ def load(self) -> None: self.is_data_loaded = True + def save(self) -> None: + """Save all SIG metadata to `sigs/*.yaml` files.""" + for sig in self.values(): + sig.save() + def by_volume(self, volume: Volume | AnthologyID) -> list[SIG]: """Find SIGs associated with a volume.""" if not self.is_data_loaded: diff --git a/python/tests/anthology_test.py b/python/tests/anthology_test.py index 094019a652..0f6a249d3f 100644 --- a/python/tests/anthology_test.py +++ b/python/tests/anthology_test.py @@ -14,8 +14,12 @@ import pytest from lxml.etree import RelaxNG +from unittest.mock import patch + from acl_anthology import Anthology -from acl_anthology.people import Name +from acl_anthology.collections import Collection +from acl_anthology.people import PersonIndex, Name +from acl_anthology.venues import VenueIndex def test_instantiate(shared_datadir): @@ -176,3 +180,26 @@ def test_load_all(anthology): assert anthology.people.is_data_loaded assert anthology.sigs.is_data_loaded assert anthology.venues.is_data_loaded + + +def test_save_all(anthology): + anthology.load_all() + anthology.collections["J89"].is_modified = True + anthology.collections["2022.acl"].is_modified = True + with ( + patch.object(Collection, "save", autospec=True) as mock, + patch.object(PersonIndex, "save") as people_mock, + patch.object(VenueIndex, "save") as venue_mock, + pytest.warns(UserWarning), + ): + anthology.save_all() + + # Get the instances that called save (first argument is self) + called_instances = [call[0][0] for call in mock.call_args_list] + assert anthology.collections["J89"] in called_instances + assert anthology.collections["2022.acl"] in called_instances + assert mock.call_count == 2 # no other instances have called save + + # Check the others + people_mock.assert_called_once() + venue_mock.assert_called_once() From 2a73ec54ca8880eea30285cb4030cb06fde0a51a Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Wed, 20 Aug 2025 00:26:10 +0200 Subject: [PATCH 05/19] Add hook that tracks modifications to collection objects --- .../acl_anthology/collections/collection.py | 3 +++ python/acl_anthology/collections/event.py | 18 ++++++++++--- python/acl_anthology/collections/index.py | 1 + python/acl_anthology/collections/paper.py | 25 ++++++++++++----- python/acl_anthology/collections/volume.py | 20 +++++++++++--- python/acl_anthology/people/person.py | 10 +++---- python/acl_anthology/utils/attrs.py | 15 ++++++++++- python/tests/collections/collection_test.py | 7 +++++ .../tests/collections/collectionindex_test.py | 1 + python/tests/collections/event_test.py | 12 +++++++++ python/tests/collections/paper_test.py | 27 +++++++++++++++++++ python/tests/collections/volume_test.py | 25 +++++++++++++++++ python/tests/people/person_test.py | 6 +++++ 13 files changed, 151 insertions(+), 19 deletions(-) diff --git a/python/acl_anthology/collections/collection.py b/python/acl_anthology/collections/collection.py index 2ae4243593..ba622e8280 100644 --- a/python/acl_anthology/collections/collection.py +++ b/python/acl_anthology/collections/collection.py @@ -202,6 +202,7 @@ def create_volume( self.root.people._add_to_index(volume.editors, volume.full_id_tuple) self.data[id] = volume + self.is_modified = True return volume def create_event( @@ -244,6 +245,7 @@ def create_event( **kwargs, ) self.root.events._add_to_index(self.event) + self.is_modified = True return self.event def load(self) -> None: @@ -297,6 +299,7 @@ def load(self) -> None: ] + self.event.colocated_ids self.is_data_loaded = True + self.is_modified = False def save(self, path: Optional[StrPath] = None, minimal_diff: bool = True) -> None: """Saves this collection as an XML file. diff --git a/python/acl_anthology/collections/event.py b/python/acl_anthology/collections/event.py index 3557de023c..3d6cce5614 100644 --- a/python/acl_anthology/collections/event.py +++ b/python/acl_anthology/collections/event.py @@ -15,7 +15,7 @@ from __future__ import annotations -from attrs import define, field, converters, validators as v +from attrs import define, field, converters, setters, validators as v from lxml import etree from lxml.builder import E from typing import Any, Iterator, Optional, TYPE_CHECKING @@ -25,7 +25,7 @@ from ..files import EventFileReference from ..people import NameSpecification from ..text import MarkupText, to_markuptext -from ..utils.attrs import auto_validate_types +from ..utils.attrs import auto_validate_types, track_modifications from ..utils.ids import AnthologyID, AnthologyIDTuple, parse_id, build_id_from_tuple if TYPE_CHECKING: @@ -87,7 +87,10 @@ def to_xml(self) -> etree._Element: return elem -@define(field_transformer=auto_validate_types) +@define( + field_transformer=auto_validate_types, + on_setattr=[setters.convert, setters.validate, track_modifications], +) class Event: """An event, such as a meeting or a conference. @@ -112,7 +115,7 @@ class Event: id: str = field(validator=v.matches_re(RE_EVENT_ID)) parent: Collection = field(repr=False, eq=False) - is_explicit: bool = field(default=False, converter=bool) + is_explicit: bool = field(default=False, converter=bool) # TODO: freeze? colocated_ids: list[tuple[AnthologyIDTuple, EventLinkingType]] = field( factory=list, @@ -134,6 +137,11 @@ class Event: location: Optional[str] = field(default=None) dates: Optional[str] = field(default=None) + @property + def collection(self) -> Collection: + """The collection this event belongs to.""" + return self.parent + @property def collection_id(self) -> str: """The collection ID this event belongs to.""" @@ -185,6 +193,8 @@ def add_colocated( return self.colocated_ids.append((volume_id, type_)) + if type_ == EventLinkingType.EXPLICIT: + self.collection.is_modified = True # Update the event index as well if self.root.events.is_data_loaded: diff --git a/python/acl_anthology/collections/index.py b/python/acl_anthology/collections/index.py index b6a168f5ae..b6d805eaf4 100644 --- a/python/acl_anthology/collections/index.py +++ b/python/acl_anthology/collections/index.py @@ -91,6 +91,7 @@ def create(self, id: str) -> Collection: path=self.parent.datadir / "xml" / f"{id}.xml", ) collection.is_data_loaded = True + collection.is_modified = True self.data[id] = collection return collection diff --git a/python/acl_anthology/collections/paper.py b/python/acl_anthology/collections/paper.py index 7945549489..c5ef86c884 100644 --- a/python/acl_anthology/collections/paper.py +++ b/python/acl_anthology/collections/paper.py @@ -15,7 +15,7 @@ from __future__ import annotations import attrs -from attrs import define, field, converters, validators as v +from attrs import define, field, converters, setters, validators as v import datetime from functools import cached_property import langcodes @@ -35,7 +35,12 @@ ) from ..people import NameSpecification from ..text import MarkupText, to_markuptext -from ..utils.attrs import auto_validate_types, date_to_str, int_to_str +from ..utils.attrs import ( + auto_validate_types, + date_to_str, + int_to_str, + track_modifications, +) from ..utils.citation import citeproc_render_html, render_acl_citation from ..utils.ids import build_id, is_valid_item_id, AnthologyIDTuple from ..utils.latex import make_bibtex_entry @@ -45,7 +50,7 @@ if TYPE_CHECKING: from ..anthology import Anthology from ..utils.latex import SerializableAsBibTeX - from . import Event, Volume + from . import Event, Volume, Collection log = get_logger() @@ -215,7 +220,10 @@ def _update_bibkey_index(paper: Paper, attr: attrs.Attribute[Any], value: str) - return value -@define(field_transformer=auto_validate_types) +@define( + field_transformer=auto_validate_types, + on_setattr=[setters.convert, setters.validate, track_modifications], +) class Paper: """A paper entry. @@ -255,7 +263,7 @@ class Paper: id: str = field(converter=int_to_str) # validator defined below parent: Volume = field(repr=False, eq=False) bibkey: str = field( - on_setattr=attrs.setters.pipe(attrs.setters.validate, _update_bibkey_index), + on_setattr=[setters.validate, track_modifications, _update_bibkey_index], ) title: MarkupText = field(converter=to_markuptext) @@ -308,7 +316,7 @@ class Paper: note: Optional[str] = field(default=None, repr=False) pages: Optional[str] = field(default=None, repr=False) paperswithcode: Optional[PapersWithCodeReference] = field( - default=None, on_setattr=attrs.setters.frozen, repr=False + default=None, on_setattr=setters.frozen, repr=False ) pdf: Optional[PDFReference] = field(default=None, repr=False) type: PaperType = field(default=PaperType.PAPER, repr=False, converter=PaperType) @@ -318,6 +326,11 @@ def _check_id(self, _: Any, value: str) -> None: if not is_valid_item_id(value): raise AnthologyInvalidIDError(value, "not a valid paper ID") + @property + def collection(self) -> Collection: + """The collection this paper belongs to.""" + return self.parent.parent + @property def collection_id(self) -> str: """The collection ID this paper belongs to.""" diff --git a/python/acl_anthology/collections/volume.py b/python/acl_anthology/collections/volume.py index f50b573e30..1434397bd2 100644 --- a/python/acl_anthology/collections/volume.py +++ b/python/acl_anthology/collections/volume.py @@ -15,7 +15,7 @@ from __future__ import annotations import datetime -from attrs import define, field, converters, validators +from attrs import define, field, converters, setters, validators from lxml import etree from lxml.builder import E from typing import Any, Iterator, Optional, cast, TYPE_CHECKING @@ -29,7 +29,12 @@ from ..people import NameSpecification from ..text import MarkupText, to_markuptext from ..venues import Venue -from ..utils.attrs import auto_validate_types, date_to_str, int_to_str +from ..utils.attrs import ( + auto_validate_types, + date_to_str, + int_to_str, + track_modifications, +) from ..utils.ids import build_id, is_valid_item_id, AnthologyIDTuple from .paper import Paper from .types import VolumeType @@ -40,7 +45,10 @@ from . import Collection, Event -@define(field_transformer=auto_validate_types) +@define( + field_transformer=auto_validate_types, + on_setattr=[setters.convert, setters.validate, track_modifications], +) class Volume(SlottedDict[Paper]): """A publication volume. @@ -117,6 +125,11 @@ def frontmatter(self) -> Paper | None: """Returns the volume's frontmatter, if any.""" return self.data.get(constants.FRONTMATTER_ID) + @property + def collection(self) -> Collection: + """The collection this volume belongs to.""" + return self.parent + @property def collection_id(self) -> str: """The collection ID this volume belongs to.""" @@ -295,6 +308,7 @@ def create_paper( self.root.people._add_to_index(paper.editors, paper.full_id_tuple) self.data[id] = paper + self.parent.is_modified = True return paper def _add_paper_from_xml(self, element: etree._Element) -> None: diff --git a/python/acl_anthology/people/person.py b/python/acl_anthology/people/person.py index 0b01bb3919..36a545989b 100644 --- a/python/acl_anthology/people/person.py +++ b/python/acl_anthology/people/person.py @@ -15,7 +15,7 @@ from __future__ import annotations import attrs -from attrs import define, field +from attrs import define, field, setters from enum import Enum import itertools as it from typing import Any, Iterator, Optional, Sequence, TYPE_CHECKING @@ -88,9 +88,7 @@ class Person: is_explicit: If True, this person's ID is explicitly defined in `people.yaml`. You probably want to use [`make_explicit()`][acl_anthology.people.person.Person.make_explicit] rather than change this attribute. """ - id: str = field( - on_setattr=attrs.setters.pipe(attrs.setters.validate, _update_person_index) - ) + id: str = field(on_setattr=[setters.validate, _update_person_index]) parent: Anthology = field(repr=False, eq=False) _names: list[tuple[Name, NameLink]] = field( factory=list, converter=_name_list_converter @@ -100,7 +98,7 @@ class Person: ) orcid: Optional[str] = field( default=None, - on_setattr=attrs.setters.pipe(attrs.setters.validate, _update_person_index), + on_setattr=[setters.validate, _update_person_index], ) # validator defined below comment: Optional[str] = field(default=None) degree: Optional[str] = field(default=None) @@ -259,10 +257,12 @@ def namespec_refers_to_self(namespec: NameSpecification) -> bool: for namespec in it.chain(paper.authors, paper.editors): if namespec_refers_to_self(namespec): namespec.id = new_id + paper.collection.is_modified = True for volume in self.volumes(): for namespec in volume.editors: if namespec_refers_to_self(namespec): namespec.id = new_id + volume.collection.is_modified = True def papers(self) -> Iterator[Paper]: """Returns an iterator over all papers associated with this person. diff --git a/python/acl_anthology/utils/attrs.py b/python/acl_anthology/utils/attrs.py index a5da86e7d2..55be784894 100644 --- a/python/acl_anthology/utils/attrs.py +++ b/python/acl_anthology/utils/attrs.py @@ -19,16 +19,29 @@ import attrs from attrs import validators import datetime -from typing import Any, Callable, Optional +from typing import Any, Callable, Optional, TypeVar, TYPE_CHECKING import re from .ids import AnthologyIDTuple +if TYPE_CHECKING: + from ..collections import Paper, Volume, Event + RE_WRAPPED_TYPE = re.compile(r"^([^\[]*)\[(.*)\]$") +T = TypeVar("T") + + +def track_modifications( + obj: Paper | Volume | Event, attr: attrs.Attribute[Any], value: T +) -> T: + if attr.name != "parent": + obj.collection.is_modified = True + return value def validate_anthology_id_tuple(cls: Any, attr: attrs.Attribute[Any], value: Any) -> None: + """Validate that an AnthologyIDTuple has the correct format.""" if ( isinstance(value, tuple) and len(value) == 3 diff --git a/python/tests/collections/collection_test.py b/python/tests/collections/collection_test.py index 205b39a88c..4ac7d71c63 100644 --- a/python/tests/collections/collection_test.py +++ b/python/tests/collections/collection_test.py @@ -83,6 +83,7 @@ def test_collection_load( assert collection.get_event() is not None else: assert collection.get_event() is None + assert not collection.is_modified @pytest.mark.filterwarnings( @@ -141,10 +142,12 @@ def test_collection_roundtrip_save( def test_collection_create_volume_implicit(collection_index): collection = collection_index.get("2022.acl") + assert not collection.is_modified volume = collection.create_volume( "keynotes", title="Keynotes from ACL 2022", ) + assert collection.is_modified assert volume.id in collection assert volume.year == "2022" assert volume.id == "keynotes" @@ -155,6 +158,7 @@ def test_collection_create_volume_implicit(collection_index): def test_collection_create_volume_explicit(collection_index): collection = collection_index.get("1989.cl") + assert not collection.is_modified volume = collection.create_volume( id="99", title=MarkupText.from_string("Special Issue"), @@ -163,6 +167,7 @@ def test_collection_create_volume_explicit(collection_index): journal_issue="99", venue_ids=["cl"], ) + assert collection.is_modified assert volume.id in collection assert volume.year == "1989" assert volume.id == "99" @@ -344,9 +349,11 @@ def test_collection_create_event_oldstyle_ids(collection_index): def test_collection_create_event_newstyle_ids(collection_index): collection = collection_index.get("1989.cl") + assert not collection.is_modified # For new-style ID collections, an explicit event ID is not required event = collection.create_event() + assert collection.is_modified assert event.id == "cl-1989" # Trying to create yet another event in the same collection should raise diff --git a/python/tests/collections/collectionindex_test.py b/python/tests/collections/collectionindex_test.py index b3f6a92adb..cdb58701a4 100644 --- a/python/tests/collections/collectionindex_test.py +++ b/python/tests/collections/collectionindex_test.py @@ -39,6 +39,7 @@ def test_collectionindex_create_collection(anthology_stub): assert collection.id == "2099.acl" assert collection.parent is index assert collection.id in index + assert collection.is_modified def test_collectionindex_create_collection_should_raise_with_oldstyle_ids(anthology_stub): diff --git a/python/tests/collections/event_test.py b/python/tests/collections/event_test.py index 58a8a81c5d..69cbb95cdb 100644 --- a/python/tests/collections/event_test.py +++ b/python/tests/collections/event_test.py @@ -152,9 +152,20 @@ def test_event_volumes(anthology): list(anthology.events.get("acl-2022").volumes()) +@pytest.mark.parametrize( + "attr_name", ("id", "colocated_ids", "links", "talks", "title", "location", "dates") +) +def test_event_setattr_sets_collection_is_modified(anthology, attr_name): + event = anthology.events.get("lrec-2006") + assert not event.collection.is_modified + setattr(event, attr_name, getattr(event, attr_name)) + assert event.collection.is_modified + + def test_event_add_colocated(anthology): event = anthology.events.get("lrec-2006") assert len(event.colocated_ids) == 1 + assert not event.collection.is_modified volume = anthology.get_volume("J89-1") # Adding colocated volume should update Event & EventIndex @@ -162,6 +173,7 @@ def test_event_add_colocated(anthology): assert len(event.colocated_ids) == 2 assert (volume.full_id_tuple, EventLinkingType.EXPLICIT) in event.colocated_ids assert event in anthology.events.by_volume(volume) + assert event.collection.is_modified # Adding the same volume a second time shouldn't change anything event.add_colocated(volume) diff --git a/python/tests/collections/paper_test.py b/python/tests/collections/paper_test.py index 99f1d55a82..b6e116e851 100644 --- a/python/tests/collections/paper_test.py +++ b/python/tests/collections/paper_test.py @@ -33,10 +33,15 @@ ) +class CollectionStub: + is_modified = False + + class VolumeStub: title = MarkupText.from_string("Generic volume") editors = [] full_id_tuple = ("2099", "stub", None) + parent = CollectionStub() @pytest.fixture @@ -131,6 +136,28 @@ def test_paper_change_id(anthology): paper.id = "1" +@pytest.mark.parametrize( + "attr_name", + ( + "id", + "bibkey", + "title", + "attachments", + "authors", + "awards", + "abstract", + "doi", + "ingest_date", + "type", + ), +) +def test_paper_setattr_sets_collection_is_modified(anthology, attr_name): + paper = anthology.get_paper("2022.acl-long.48") + assert not paper.collection.is_modified + setattr(paper, attr_name, getattr(paper, attr_name)) + assert paper.collection.is_modified + + test_cases_language = ( ("2022.acl-short.11", None, None), ("2022.naloma-1.3", "fra", "French"), diff --git a/python/tests/collections/volume_test.py b/python/tests/collections/volume_test.py index d9aff7cd22..cf9401a737 100644 --- a/python/tests/collections/volume_test.py +++ b/python/tests/collections/volume_test.py @@ -235,6 +235,27 @@ def test_volume_set_ingest_date(anthology): assert volume.ingest_date == "2026-03-01" +@pytest.mark.parametrize( + "attr_name", + ( + "id", + "title", + "year", + "editors", + "venue_ids", + "address", + "ingest_date", + "pdf", + "shorttitle", + ), +) +def test_volume_setattr_sets_collection_is_modified(anthology, attr_name): + volume = anthology.get_volume("2022.acl-long") + assert not volume.parent.is_modified + setattr(volume, attr_name, getattr(volume, attr_name)) + assert volume.parent.is_modified + + def test_volume_venues_j89(anthology): volume = anthology.get_volume("J89-1") assert volume.venue_ids == ["cl"] @@ -342,12 +363,14 @@ def test_volume_generate_paper_id(anthology): def test_volume_create_paper_implicit(anthology): volume = anthology.get_volume("2022.acl-long") + assert not volume.collection.is_modified authors = [NameSpec("Bollmann, Marcel")] paper = volume.create_paper( title="The awesome paper I have never written", authors=authors, ingest_date="2025-01-07", ) + assert volume.collection.is_modified assert paper.authors == authors assert paper.title.as_text() == "The awesome paper I have never written" assert paper.ingest_date == "2025-01-07" @@ -362,6 +385,7 @@ def test_volume_create_paper_implicit(anthology): def test_volume_create_paper_explicit(anthology): volume = anthology.get_volume("2022.acl-long") + assert not volume.collection.is_modified authors = [NameSpec("Bollmann, Marcel")] paper = volume.create_paper( title="The awesome paper I have never written", @@ -370,6 +394,7 @@ def test_volume_create_paper_explicit(anthology): id="701", bibkey="bollmann-2022-the-awesome", ) + assert volume.collection.is_modified assert paper.authors == authors assert paper.title.as_text() == "The awesome paper I have never written" assert paper.ingest_date == "2025-01-07" diff --git a/python/tests/people/person_test.py b/python/tests/people/person_test.py index ec1c376da0..bc8cd6aa36 100644 --- a/python/tests/people/person_test.py +++ b/python/tests/people/person_test.py @@ -158,6 +158,7 @@ def test_person_update_id_should_update_connected_papers(anthology): namespec = anthology.get(person.item_ids[0]).authors[-1] assert namespec.name == Name("Yang", "Liu") assert namespec.id == "yang-liu-new" + assert anthology.collections["2022.acl"].is_modified def test_person_cannot_update_id_when_inferred(anthology): @@ -197,6 +198,11 @@ def test_person_make_explicit(anthology): person.make_explicit("nicoletta-calzolari") assert person.is_explicit assert person.id == "nicoletta-calzolari" + # IDs have been added to these collections: + assert anthology.collections["J89"].is_modified + assert anthology.collections["L06"].is_modified + # But not this one: + assert not anthology.collections["2022.acl"].is_modified def test_person_make_explicit_should_raise_when_explicit(anthology): From dbe3727571f8a0565601e90a4aa9d70e7babe394 Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Wed, 20 Aug 2025 00:49:12 +0200 Subject: [PATCH 06/19] Update documentation regarding save_all() functionality --- python/docs/guide/modifying-data.md | 56 +++++++++++++++++++---------- 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/python/docs/guide/modifying-data.md b/python/docs/guide/modifying-data.md index 7de425b599..e88ef5b4e7 100644 --- a/python/docs/guide/modifying-data.md +++ b/python/docs/guide/modifying-data.md @@ -12,16 +12,18 @@ the Anthology.** The implementation of this is complicated by the various indices and objects such as persons that cross-reference other objects. Here are some rules of thumb when making modifications to the data: -1. To create new objects, use `create_` functions provided by the library +1. To **create new objects**, use `create_` functions provided by the library whenever possible, rather than instantiating them directly. -2. You can modify objects by simply modifying their attributes, as long as the - object in question has an explicit representation in the Anthology data. +2. You can modify objects by simply **modifying their attributes**, as long as + the object in question has an explicit representation in the Anthology data. - This includes collections, volumes, papers, events. - It also includes persons where `Person.is_explicit == True`, as those have an explicit representation in `people.yaml`. -3. Saving data is always non-destructive and will avoid introducing unnecessary - changes (e.g. no needless reordering of XML tags). {==This is currently only - true & tested for XML files, not for the YAML files.==} +3. **Saving data is always non-destructive**. In XML files, it will also avoid + introducing unnecessary changes (e.g. no needless reordering of tags). The + only exception to this is saving SIG YAML files, as they currently frequently + contain comments, which will be lost when saving these files through the + library. 4. If you need to refer to indices such as [PersonIndex][acl_anthology.people.index.PersonIndex], [EventIndex][acl_anthology.collections.eventindex.EventIndex], or @@ -341,17 +343,35 @@ Volumes can be connected to venues by modifying the volume's `venue_ids` list. { ## Saving changes -- **Changes to a collection/volume/paper** can be saved by calling - [`Collection.save()`][acl_anthology.collections.collection.Collection.save]. - This will use a [minimal-diff - algorithm][acl_anthology.utils.xml.ensure_minimal_diff] by default to avoid - introducing "noise" in the diffs, i.e. changes to the XML that do not make a - semantic difference, such as reordering certain tags, attributes, or - introducing/deleting certain empty tags. It is also guaranteed to be - non-destructive through [integration tests on the entire Anthology - data](https://github.com/acl-org/acl-anthology/blob/master/python/tests/anthology_integration_test.py). +!!! tip "Rule of thumb" + + Call [`anthology.save_all()`][acl_anthology.anthology.Anthology.save_all] to save all metadata changes. + +Calling [`save_all()`][acl_anthology.anthology.Anthology.save_all] will write +XML and YAML files to the Anthology's data directory, with the following +caveats: + +- **Collections will track if they have been modified** to prevent writing XML + files unnecessarily. As with modifying attributes in general, this requires + that you have _set_ an attribute; modifying attributes in-place, e.g. lists, + will not be detected. + + - Saving a collection manually can be done by calling + [`Collection.save()`][acl_anthology.collections.collection.Collection.save]. + + - Saving a collection uses a [minimal-diff + algorithm][acl_anthology.utils.xml.ensure_minimal_diff] by default to avoid + introducing "noise" in the diffs, i.e. changes to the XML that do not make a + semantic difference, such as reordering certain tags, attributes, or + introducing/deleting certain empty tags. It is also guaranteed to be + non-destructive through [integration tests on the entire Anthology + data](https://github.com/acl-org/acl-anthology/blob/master/python/tests/anthology_integration_test.py). -- **Changes to the person database (`people.yaml`)** can be saved by calling - [`PersonIndex.save()`][acl_anthology.people.index.PersonIndex.save]. +- **YAML files will always be written**. Serializing all YAML files is much + faster than serializing all XML files, so they are written unconditionally, + without tracking changes. -{==TODO: changes to other YAML files, `Anthology.save_all()`, etc. ==} +- **SIG YAML files are currently not written automatically**. This is because + the current format of the SIG YAML files is a bit arcane, and existing files + use a lot of comments, which would be deleted upon writing these files. + {==This may change in the future.==} From 3c824439c93e9b036798164ab1f4e8d3aea06a4a Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Wed, 20 Aug 2025 00:53:33 +0200 Subject: [PATCH 07/19] Rename EventLinkingType -> EventLink, analogous to NameLink --- python/acl_anthology/collections/__init__.py | 4 +-- .../acl_anthology/collections/collection.py | 6 ++-- python/acl_anthology/collections/event.py | 17 +++++------ .../acl_anthology/collections/eventindex.py | 6 ++-- python/acl_anthology/collections/types.py | 2 +- python/tests/collections/collection_test.py | 8 +++--- python/tests/collections/event_test.py | 18 ++++++------ python/tests/collections/eventindex_test.py | 28 +++++++++---------- 8 files changed, 42 insertions(+), 47 deletions(-) diff --git a/python/acl_anthology/collections/__init__.py b/python/acl_anthology/collections/__init__.py index 1e55c537ce..aa156995b7 100644 --- a/python/acl_anthology/collections/__init__.py +++ b/python/acl_anthology/collections/__init__.py @@ -18,7 +18,7 @@ from .eventindex import EventIndex from .event import Event, Talk from .volume import Volume -from .types import EventLinkingType, PaperDeletionType, PaperType, VolumeType +from .types import EventLink, PaperDeletionType, PaperType, VolumeType from .paper import Paper @@ -28,7 +28,7 @@ "CollectionIndex", "Event", "EventIndex", - "EventLinkingType", + "EventLink", "Paper", "PaperDeletionType", "PaperType", diff --git a/python/acl_anthology/collections/collection.py b/python/acl_anthology/collections/collection.py index ba622e8280..ab02385527 100644 --- a/python/acl_anthology/collections/collection.py +++ b/python/acl_anthology/collections/collection.py @@ -33,7 +33,7 @@ from ..utils.logging import get_logger from ..utils import xml from .event import Event -from .types import EventLinkingType, VolumeType +from .types import EventLink, VolumeType from .volume import Volume from .paper import Paper @@ -290,11 +290,11 @@ def load(self) -> None: if self.event is not None: # Events are implicitly linked to volumes defined in the same collection self.event.colocated_ids = [ - (volume.full_id_tuple, EventLinkingType.INFERRED) + (volume.full_id_tuple, EventLink.INFERRED) for volume in self.data.values() # Edge case: in case the block lists a volume in # the same collection, don't add it twice - if (volume.full_id_tuple, EventLinkingType.EXPLICIT) + if (volume.full_id_tuple, EventLink.EXPLICIT) not in self.event.colocated_ids ] + self.event.colocated_ids diff --git a/python/acl_anthology/collections/event.py b/python/acl_anthology/collections/event.py index 3d6cce5614..4b21482199 100644 --- a/python/acl_anthology/collections/event.py +++ b/python/acl_anthology/collections/event.py @@ -20,7 +20,7 @@ from lxml.builder import E from typing import Any, Iterator, Optional, TYPE_CHECKING -from .types import EventLinkingType +from .types import EventLink from ..constants import RE_EVENT_ID from ..files import EventFileReference from ..people import NameSpecification @@ -117,7 +117,7 @@ class Event: parent: Collection = field(repr=False, eq=False) is_explicit: bool = field(default=False, converter=bool) # TODO: freeze? - colocated_ids: list[tuple[AnthologyIDTuple, EventLinkingType]] = field( + colocated_ids: list[tuple[AnthologyIDTuple, EventLink]] = field( factory=list, repr=lambda x: f"", ) @@ -166,7 +166,7 @@ def volumes(self) -> Iterator[Volume]: def add_colocated( self, volume: Volume | AnthologyID, - type_: EventLinkingType = EventLinkingType.EXPLICIT, + type_: EventLink = EventLink.EXPLICIT, ) -> None: """Add a co-located volume to this event. @@ -185,15 +185,12 @@ def add_colocated( for idx, (existing_id, existing_type) in enumerate(self.colocated_ids): if volume_id == existing_id: - if ( - existing_type == EventLinkingType.INFERRED - and type_ == EventLinkingType.EXPLICIT - ): + if existing_type == EventLink.INFERRED and type_ == EventLink.EXPLICIT: self.colocated_ids[idx] = (volume_id, type_) return self.colocated_ids.append((volume_id, type_)) - if type_ == EventLinkingType.EXPLICIT: + if type_ == EventLink.EXPLICIT: self.collection.is_modified = True # Update the event index as well @@ -225,7 +222,7 @@ def from_xml(cls, parent: Collection, event: etree._Element) -> Event: kwargs["talks"].append(Talk.from_xml(element)) elif element.tag == "colocated": kwargs["colocated_ids"] = [ - (parse_id(str(volume_id.text)), EventLinkingType.EXPLICIT) + (parse_id(str(volume_id.text)), EventLink.EXPLICIT) for volume_id in element if volume_id.tag == "volume-id" ] @@ -264,7 +261,7 @@ def to_xml(self) -> etree._Element: if self.colocated_ids: colocated = E.colocated() for id_tuple, el_type in self.colocated_ids: - if el_type == EventLinkingType.EXPLICIT: + if el_type == EventLink.EXPLICIT: colocated.append( getattr(E, "volume-id")(build_id_from_tuple(id_tuple)) ) diff --git a/python/acl_anthology/collections/eventindex.py b/python/acl_anthology/collections/eventindex.py index 95e2a51369..3951cba6ea 100644 --- a/python/acl_anthology/collections/eventindex.py +++ b/python/acl_anthology/collections/eventindex.py @@ -24,7 +24,7 @@ from ..utils.ids import AnthologyID, AnthologyIDTuple, parse_id from ..utils.logging import get_logger from .event import Event -from .types import EventLinkingType +from .types import EventLink from .volume import Volume if TYPE_CHECKING: @@ -124,12 +124,12 @@ def load(self) -> None: event_id, collection, is_explicit=False, - colocated_ids=[(volume_fid, EventLinkingType.INFERRED)], + colocated_ids=[(volume_fid, EventLink.INFERRED)], title=MarkupText.from_string(event_name), ) else: # Add implicit connection to existing event - event.add_colocated(volume_fid, EventLinkingType.INFERRED) + event.add_colocated(volume_fid, EventLink.INFERRED) self.reverse[volume_fid].add(event_id) except Exception as exc: log.exception(exc) diff --git a/python/acl_anthology/collections/types.py b/python/acl_anthology/collections/types.py index fe5e5e844f..6418914999 100644 --- a/python/acl_anthology/collections/types.py +++ b/python/acl_anthology/collections/types.py @@ -15,7 +15,7 @@ from enum import Enum -class EventLinkingType(Enum): +class EventLink(Enum): """How a volume ID was connected to an Event.""" EXPLICIT = "explicit" diff --git a/python/tests/collections/collection_test.py b/python/tests/collections/collection_test.py index 4ac7d71c63..b6fb6665f5 100644 --- a/python/tests/collections/collection_test.py +++ b/python/tests/collections/collection_test.py @@ -21,7 +21,7 @@ from acl_anthology.collections import ( Collection, CollectionIndex, - EventLinkingType, + EventLink, VolumeType, ) from acl_anthology.people import NameSpecification @@ -265,7 +265,7 @@ def test_collection_create_volume_should_create_event(anthology, pre_load, reset # New implicit event should exist in the event index assert "acl-2000" in anthology.events - assert (volume.full_id_tuple, EventLinkingType.INFERRED) in anthology.events[ + assert (volume.full_id_tuple, EventLink.INFERRED) in anthology.events[ "acl-2000" ].colocated_ids assert volume.full_id_tuple in anthology.events.reverse @@ -298,7 +298,7 @@ def test_collection_create_volume_should_update_event(anthology, pre_load, reset # New volume should be added to existing event assert "acl-2022" in anthology.events - assert (volume.full_id_tuple, EventLinkingType.INFERRED) in anthology.events[ + assert (volume.full_id_tuple, EventLink.INFERRED) in anthology.events[ "acl-2022" ].colocated_ids assert volume.full_id_tuple in anthology.events.reverse @@ -372,7 +372,7 @@ def test_collection_create_event_should_update_eventindex(pre_load, anthology): if pre_load: # Volume should automatically have been added assert event.colocated_ids == [ - (collection.get("1").full_id_tuple, EventLinkingType.INFERRED) + (collection.get("1").full_id_tuple, EventLink.INFERRED) ] else: # If event index wasn't loaded, it's not diff --git a/python/tests/collections/event_test.py b/python/tests/collections/event_test.py index 69cbb95cdb..72e751c701 100644 --- a/python/tests/collections/event_test.py +++ b/python/tests/collections/event_test.py @@ -16,7 +16,7 @@ from attrs import define from lxml import etree -from acl_anthology.collections import Event, EventLinkingType, Talk +from acl_anthology.collections import Event, EventLink, Talk from acl_anthology.files import EventFileReference from acl_anthology.text import MarkupText from acl_anthology.utils.xml import indent @@ -91,9 +91,9 @@ def test_event_all_attribs(): location="Online", dates="August 17-19, 2023", colocated_ids=[ - (("2023.foobar", "1", None), EventLinkingType.EXPLICIT), - (("2023.baz", "1", None), EventLinkingType.EXPLICIT), - (("2023.asdf", "1", None), EventLinkingType.EXPLICIT), + (("2023.foobar", "1", None), EventLink.EXPLICIT), + (("2023.baz", "1", None), EventLink.EXPLICIT), + (("2023.asdf", "1", None), EventLink.EXPLICIT), ], talks=[Talk("Invited talk")], links={"Website": EventFileReference("http://foobar.com")}, @@ -108,10 +108,10 @@ def test_event_to_xml_dont_list_colocated_volumes_of_parent(): id="li-2023", parent=CollectionStub("2023.li"), colocated_ids=[ - (("2023.baz", "1", None), EventLinkingType.EXPLICIT), - (("2023.li", "main", None), EventLinkingType.INFERRED), - (("2023.li", "side", None), EventLinkingType.INFERRED), - (("2023.ling", "1", None), EventLinkingType.EXPLICIT), + (("2023.baz", "1", None), EventLink.EXPLICIT), + (("2023.li", "main", None), EventLink.INFERRED), + (("2023.li", "side", None), EventLink.INFERRED), + (("2023.ling", "1", None), EventLink.EXPLICIT), ], ) out = event.to_xml() @@ -171,7 +171,7 @@ def test_event_add_colocated(anthology): # Adding colocated volume should update Event & EventIndex event.add_colocated(volume) assert len(event.colocated_ids) == 2 - assert (volume.full_id_tuple, EventLinkingType.EXPLICIT) in event.colocated_ids + assert (volume.full_id_tuple, EventLink.EXPLICIT) in event.colocated_ids assert event in anthology.events.by_volume(volume) assert event.collection.is_modified diff --git a/python/tests/collections/eventindex_test.py b/python/tests/collections/eventindex_test.py index 0f7e10a377..8f41781b05 100644 --- a/python/tests/collections/eventindex_test.py +++ b/python/tests/collections/eventindex_test.py @@ -13,7 +13,7 @@ # limitations under the License. import pytest -from acl_anthology.collections import EventIndex, EventLinkingType +from acl_anthology.collections import EventIndex, EventLink def test_all_defined_events(anthology): @@ -40,9 +40,7 @@ def test_implicit_event_data(anthology): ) assert event.location is None assert event.dates is None - assert event.colocated_ids == [ - (("2022.naloma", "1", None), EventLinkingType.INFERRED) - ] + assert event.colocated_ids == [(("2022.naloma", "1", None), EventLink.INFERRED)] def test_implicit_and_explicit_event_data(anthology): @@ -52,8 +50,8 @@ def test_implicit_and_explicit_event_data(anthology): assert event.location is None assert event.dates is None assert event.colocated_ids == [ - (("2022.nonexistant", "1", None), EventLinkingType.EXPLICIT), - (("2022.naloma", "1", None), EventLinkingType.INFERRED), + (("2022.nonexistant", "1", None), EventLink.EXPLICIT), + (("2022.naloma", "1", None), EventLink.INFERRED), ] @@ -75,15 +73,15 @@ def test_explicit_event_data(anthology): assert event.location == "Dublin, Ireland" assert event.dates == "May 22–27, 2022" assert event.colocated_ids == [ - (("2022.acl", "long", None), EventLinkingType.INFERRED), - (("2022.acl", "short", None), EventLinkingType.INFERRED), - (("2022.acl", "srw", None), EventLinkingType.INFERRED), - (("2022.acl", "demo", None), EventLinkingType.INFERRED), - (("2022.acl", "tutorials", None), EventLinkingType.INFERRED), - (("2022.findings", "acl", None), EventLinkingType.EXPLICIT), - (("2022.bigscience", "1", None), EventLinkingType.EXPLICIT), - (("2022.naloma", "1", None), EventLinkingType.EXPLICIT), - (("2022.wit", "1", None), EventLinkingType.EXPLICIT), + (("2022.acl", "long", None), EventLink.INFERRED), + (("2022.acl", "short", None), EventLink.INFERRED), + (("2022.acl", "srw", None), EventLink.INFERRED), + (("2022.acl", "demo", None), EventLink.INFERRED), + (("2022.acl", "tutorials", None), EventLink.INFERRED), + (("2022.findings", "acl", None), EventLink.EXPLICIT), + (("2022.bigscience", "1", None), EventLink.EXPLICIT), + (("2022.naloma", "1", None), EventLink.EXPLICIT), + (("2022.wit", "1", None), EventLink.EXPLICIT), ] From 86819697c5b70a977c37b020e97fed0ba18e9bda Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Wed, 20 Aug 2025 01:01:09 +0200 Subject: [PATCH 08/19] Update CHANGELOG --- python/CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md index 8bcb077605..8ead837635 100644 --- a/python/CHANGELOG.md +++ b/python/CHANGELOG.md @@ -6,6 +6,7 @@ This release implements the new [name resolution and author ID logic](https://gi ### Added +- Anthology now provides `save_all()` to conveniently save all data files. The library tracks modifications to collection objects to only write XML files that have actually changed. - NameSpecification now provides an `orcid` field. - Person: - Now provides `orcid`, `degree`, `disable_name_matching`, and `similar_ids` fields that correspond to the respective fields in the new `people.yaml`. @@ -17,6 +18,8 @@ This release implements the new [name resolution and author ID logic](https://gi - Now also keeps a mapping of name slugs to (verified) person IDs, via `slugs_to_verified_ids` (mostly for internal use). - Added `ingest_namespec()` to implement the [matching logic on ingestion](https://github.com/acl-org/acl-anthology/wiki/Author-Page-Plan#ingestion) of new volumes. - Added `create_person()` to instantiate a new Person and add it to the index. +- MarkupText now provides a `from_()` class method that calls the appropriate builder method, using heuristic markup parsing if instantiated from a string. +- MarkupText now supports some common string methods, such as `__contains__`, `endswith`, `startswith`. ### Changed @@ -28,6 +31,8 @@ This release implements the new [name resolution and author ID logic](https://gi - Changed the previously experimental `save()` function to serialize the `people.yaml` file. - Person now stores names as tuples of `(Name, NameLink)`, the latter of which indicates if the name was explicitly defined in `people.yaml` or inferred by the name resolution logic (e.g. via slug matching). As a consequence, `Person.names` can no longer be modified in-place; use `Person.add_name()`, `Person.remove_name()`, or the setter of `Person.names`. - Setting a canonical name for a Person changed from `.set_canonical_name()` to `Person.canonical_name = ...` +- Attributes that expect a MarkupText, such as `Volume.title` or `Paper.abstract`, can now be set to a string, in which case the string will be automatically converted to MarkupText, including markup parsing. +- EventLinkingType renamed to EventLink. ## [0.5.3] — 2025-06-22 From aa4063d10189b9e83233fa948979e9b4cf6cb626 Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Wed, 20 Aug 2025 13:11:38 +0200 Subject: [PATCH 09/19] Rename create_person -> create --- python/CHANGELOG.md | 2 +- python/acl_anthology/people/index.py | 2 +- python/docs/guide/modifying-data.md | 6 +++--- python/tests/people/personindex_test.py | 12 ++++++------ 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md index 8ead837635..49ac07b965 100644 --- a/python/CHANGELOG.md +++ b/python/CHANGELOG.md @@ -17,7 +17,7 @@ This release implements the new [name resolution and author ID logic](https://gi - Now also indexes Person objects by ORCID, and provides `by_orcid` and `get_by_orcid()`. - Now also keeps a mapping of name slugs to (verified) person IDs, via `slugs_to_verified_ids` (mostly for internal use). - Added `ingest_namespec()` to implement the [matching logic on ingestion](https://github.com/acl-org/acl-anthology/wiki/Author-Page-Plan#ingestion) of new volumes. - - Added `create_person()` to instantiate a new Person and add it to the index. + - Added `PersonIndex.create` to instantiate a new Person and add it to the index. - MarkupText now provides a `from_()` class method that calls the appropriate builder method, using heuristic markup parsing if instantiated from a string. - MarkupText now supports some common string methods, such as `__contains__`, `endswith`, `startswith`. diff --git a/python/acl_anthology/people/index.py b/python/acl_anthology/people/index.py index 980909d464..c94b1d83ad 100644 --- a/python/acl_anthology/people/index.py +++ b/python/acl_anthology/people/index.py @@ -346,7 +346,7 @@ def add_person(self, person: Person) -> None: if is_verified_person_id(pid): self._slugs_to_verified_ids[name.slugify()].add(pid) - def create_person( + def create( self, id: str, names: list[Name], diff --git a/python/docs/guide/modifying-data.md b/python/docs/guide/modifying-data.md index e88ef5b4e7..cd37b2fb40 100644 --- a/python/docs/guide/modifying-data.md +++ b/python/docs/guide/modifying-data.md @@ -156,7 +156,7 @@ A person can be _explicit_ (has an entry in `people.yaml`) or _inferred_ (was in Manually creating a new person (that will get saved to `people.yaml` and can have an ORCID and other metadata) can be done in two ways: -1. By calling [`PersonIndex.create_person()`][acl_anthology.people.index.PersonIndex.create_person]. The returned Person is _not_ linked to any papers/volumes, but you can set their ID afterwards on name specifications. +1. By calling [`PersonIndex.create()`][acl_anthology.people.index.PersonIndex.create]. The returned Person is _not_ linked to any papers/volumes, but you can set their ID afterwards on name specifications. 2. By calling [`make_explicit()`][acl_anthology.people.person.Person.make_explicit] on a currently _inferred_ person. This will not only add this person to the database, but also **set their ID on all papers/volumes** currently associated with them. @@ -174,7 +174,7 @@ have an ORCID and other metadata) can be done in two ways: **Situation:** A person `p1` is currently associated with papers/volumes that actually belong to different people, who just happened to publish under the same name. We want to create a new person instance for the other author with the same name. -1. Call [`anthology.people.create_person()`][acl_anthology.people.index.PersonIndex.create_person] for all persons who do not have an explicit ID yet, giving all the names that can refer to this person. Also supply the ORCID when calling this function, if it is known. +1. Call [`anthology.people.create()`][acl_anthology.people.index.PersonIndex.create] for all persons who do not have an explicit ID yet, giving all the names that can refer to this person. Also supply the ORCID when calling this function, if it is known. 2. For each person, iterate through the papers that actually belong to them and update the name specification that currently resolves to `p1` by setting the explicit ID of the correct newly-created person. {==TODO: Same as above: It's currently a bit tricky to find the _name specification_ referring to a person; should add a function for this.==} @@ -290,7 +290,7 @@ NameSpecification(Name("Marcel", "Bollmann"), orcid="0000-0003-2598-8150") If an ORCID is supplied, the NameSpecification also needs to have an explicit ID referring to an entry in `people.yaml`. **The library can add an ID automatically** as long as you supply the author/editor list to the `create_` -function, so there is typically **no need to call `create_person()`** during +function, so there is typically **no need to call `create()`** during ingestion! !!! example diff --git a/python/tests/people/personindex_test.py b/python/tests/people/personindex_test.py index 87b9731f16..f23e8a1e0c 100644 --- a/python/tests/people/personindex_test.py +++ b/python/tests/people/personindex_test.py @@ -189,7 +189,7 @@ def test_change_orcid(index): def test_create_person(index): - person = index.create_person( + person = index.create( id="matt-post", names=[Name("Matt", "Post")], orcid="0000-0002-1297-6794", @@ -202,7 +202,7 @@ def test_create_person(index): def test_create_person_should_fail_on_duplicate_orcid(index): with pytest.raises(ValueError): - index.create_person( + index.create( id="marcel-bollmann-twin", names=[Name("Marcel", "Bollmann")], orcid="0000-0003-2598-8150", # already assigned to "marcel-bollmann" @@ -211,7 +211,7 @@ def test_create_person_should_fail_on_duplicate_orcid(index): def test_create_person_should_fail_on_duplicate_id(index): with pytest.raises(AnthologyInvalidIDError): - index.create_person( + index.create( id="marcel-bollmann", # already exists names=[Name("Marcel", "Bollmann")], ) @@ -219,7 +219,7 @@ def test_create_person_should_fail_on_duplicate_id(index): def test_create_person_should_fail_on_unverified_id(index): with pytest.raises(AnthologyInvalidIDError): - index.create_person( + index.create( id="unverified/john-doe", # cannot create this manually names=[Name("John", "Doe")], ) @@ -227,7 +227,7 @@ def test_create_person_should_fail_on_unverified_id(index): def test_create_person_should_fail_on_empty_names(index): with pytest.raises(ValueError): - index.create_person( + index.create( id="john-doe-new", names=[], # cannot be empty ) @@ -644,7 +644,7 @@ def test_add_person_to_people_yaml_via_create_person(index, tmp_path): yaml_out = tmp_path / "people.create_person.yaml" # Modifications - index.create_person( + index.create( id="preslav-nakov", names=[Name("Preslav", "Nakov")], orcid="0000-0002-3600-1510", From 8455e160157c9d922c6ba07d615070b506539fd9 Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Wed, 20 Aug 2025 13:31:39 +0200 Subject: [PATCH 10/19] Add VenueIndex.create --- python/CHANGELOG.md | 1 + python/acl_anthology/constants.py | 3 +++ python/acl_anthology/venues.py | 37 +++++++++++++++++++++++++---- python/docs/guide/modifying-data.md | 6 ++++- python/tests/venues_test.py | 20 +++++++++++++++- 5 files changed, 61 insertions(+), 6 deletions(-) diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md index 49ac07b965..9015d6b8d0 100644 --- a/python/CHANGELOG.md +++ b/python/CHANGELOG.md @@ -20,6 +20,7 @@ This release implements the new [name resolution and author ID logic](https://gi - Added `PersonIndex.create` to instantiate a new Person and add it to the index. - MarkupText now provides a `from_()` class method that calls the appropriate builder method, using heuristic markup parsing if instantiated from a string. - MarkupText now supports some common string methods, such as `__contains__`, `endswith`, `startswith`. +- Venues can now be created via `VenueIndex.create()`. ### Changed diff --git a/python/acl_anthology/constants.py b/python/acl_anthology/constants.py index 2b8ec9106b..7d85a323a9 100644 --- a/python/acl_anthology/constants.py +++ b/python/acl_anthology/constants.py @@ -29,5 +29,8 @@ RE_EVENT_ID = r"^[a-z0-9]+-[0-9]{4}$" """A regular expression matching a valid event ID.""" +RE_VENUE_ID = r"^[a-z][a-z0-9]+$" +"""A regular expression matching a valid venue ID.""" + RE_ISO_DATE = r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$" """A regular expression matching a date in ISO format.""" diff --git a/python/acl_anthology/venues.py b/python/acl_anthology/venues.py index 814647a914..5702dd3e53 100644 --- a/python/acl_anthology/venues.py +++ b/python/acl_anthology/venues.py @@ -16,7 +16,7 @@ from attrs import define, field, validators as v, asdict from pathlib import Path -from typing import Iterator, Optional, TYPE_CHECKING +from typing import Any, Iterator, Optional, TYPE_CHECKING import yaml try: @@ -26,6 +26,7 @@ from .utils.attrs import auto_validate_types from .utils.ids import AnthologyIDTuple, build_id_from_tuple +from .constants import RE_VENUE_ID from .containers import SlottedDict if TYPE_CHECKING: @@ -51,7 +52,7 @@ class Venue: url: A website URL for the venue. """ - id: str = field(converter=str) + id: str = field(converter=str, validator=v.matches_re(RE_VENUE_ID)) parent: Anthology = field(repr=False, eq=False) acronym: str = field(converter=str) name: str = field(converter=str) @@ -134,7 +135,7 @@ class VenueIndex(SlottedDict[Venue]): is_data_loaded: bool = field(init=False, repr=True, default=False) def load(self) -> None: - """Loads and parses the `venues/*.yaml` files. + """Load and parse the `venues/*.yaml` files. Raises: KeyError: If a mandatory key is missing in a YAML file. @@ -149,8 +150,36 @@ def load(self) -> None: self.build() self.is_data_loaded = True + def create(self, id: str, acronym: str, name: str, **kwargs: Any) -> Venue: + """Create a new venue and add it to the index. + + Parameters: + id: The ID of the new venue. + acronym: The acronym of the new venue. + name: The name of the new venue. + **kwargs: Any valid optional attribute of [Venue][acl_anthology.venues.Venue], with the exception of `path`, which is automatically set based on the venue ID, as well as `item_ids` and `oldstyle_letter`, which cannot be set. + + Returns: + The created [Venue][acl_anthology.venues.Venue] object. + + Raises: + KeyError: If an invalid attribute is supplied in `**kwargs`. + """ + if "item_ids" in kwargs: + raise KeyError( + "Cannot specify `item_ids` for Venue; add its ID to the volume(s) instead." + ) + if "oldstyle_letter" in kwargs: + raise KeyError("Cannot specify a new venue with an old-style letter.") + + kwargs["parent"] = self.parent + kwargs["path"] = self.parent.datadir / "yaml" / "venues" / f"{id}.yaml" + venue = Venue(id=id, acronym=acronym, name=name, **kwargs) + self.data[id] = venue + return venue + def reset(self) -> None: - """Resets the index.""" + """Reset the index.""" self.data = {} self.is_data_loaded = False diff --git a/python/docs/guide/modifying-data.md b/python/docs/guide/modifying-data.md index cd37b2fb40..9aac029290 100644 --- a/python/docs/guide/modifying-data.md +++ b/python/docs/guide/modifying-data.md @@ -336,7 +336,11 @@ the gory details), it's best to ensure that: ### Connecting to venues and SIGs -Volumes can be connected to venues by modifying the volume's `venue_ids` list. {==TODO: adding new venues==} +Volumes can be connected to venues by modifying the volume's `venue_ids` list. +New venues can be added by calling +[`VenueIndex.create()`][acl_anthology.venues.VenueIndex.create], which will also +create a corresponding YAML file upon saving. Afterwards, the ID used when +instantiating the venue can be used in a volume's `venue_ids`. {==TODO: connecting to SIGs; we may want to refactor how SIGs are represented before introducing this functionality.==} diff --git a/python/tests/venues_test.py b/python/tests/venues_test.py index b3bb931711..67da9b2c22 100644 --- a/python/tests/venues_test.py +++ b/python/tests/venues_test.py @@ -74,8 +74,26 @@ def test_venue_roundtrip_yaml(anthology_stub, tmp_path, venue_id): assert out == expected +def test_venueindex_create(anthology): + index = anthology.venues + venue = index.create( + id="acla", acronym="ACLA", name="ACL Anthology Workshop", is_acl=True + ) + assert "acla" in index + assert index["acla"] is venue + assert venue.acronym == "ACLA" + assert venue.name == "ACL Anthology Workshop" + assert venue.is_acl + assert venue.path.name == "acla.yaml" + + +def test_venueindex_create_with_invalid_id(anthology): + with pytest.raises(ValueError): + anthology.venues.create(id="acl-a", acronym="ACLA", name="ACL Anthology Workshop") + + def test_venueindex_cl(anthology): - index = VenueIndex(anthology) + index = anthology.venues venue = index.get("cl") assert venue.id == "cl" assert venue.acronym == "CL" From 8d750053b8a24a41f3388d532ff0ea3f5dfc0fb3 Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Wed, 20 Aug 2025 14:56:01 +0200 Subject: [PATCH 11/19] Fix backslash escape --- python/tests/text/markuptext_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/tests/text/markuptext_test.py b/python/tests/text/markuptext_test.py index e1d8558140..c7e3e3fbbe 100644 --- a/python/tests/text/markuptext_test.py +++ b/python/tests/text/markuptext_test.py @@ -431,14 +431,14 @@ def test_markup_from_latex_maybe(inp, out1, out2): def test_markup_behaves_like_string(): markup = MarkupText.from_latex( - "TTCS$^{\mathcal{E}}$: a Vectorial Resource for Computing Conceptual Similarity" + "TTCS$^{\\mathcal{E}}$: a Vectorial Resource for Computing Conceptual Similarity" ) assert ( markup - == "TTCS^{\mathcal{E}}: a Vectorial Resource for Computing Conceptual Similarity" + == "TTCS^{\\mathcal{E}}: a Vectorial Resource for Computing Conceptual Similarity" ) assert markup == MarkupText.from_latex( - "TTCS$^{\mathcal{E}}$: a Vectorial Resource for Computing Conceptual Similarity" + "TTCS$^{\\mathcal{E}}$: a Vectorial Resource for Computing Conceptual Similarity" ) assert "Vectorial" in markup assert markup.startswith("TTCS") From 12d7d1698673e117b75c59c869178fe7b187e69c Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Thu, 21 Aug 2025 11:01:35 +0200 Subject: [PATCH 12/19] Fix a few documentation errors --- python/acl_anthology/collections/event.py | 2 +- python/docs/guide/accessing-authors.md | 6 +----- python/docs/guide/modifying-data.md | 2 +- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/python/acl_anthology/collections/event.py b/python/acl_anthology/collections/event.py index 4b21482199..d120313906 100644 --- a/python/acl_anthology/collections/event.py +++ b/python/acl_anthology/collections/event.py @@ -103,7 +103,7 @@ class Event: is_explicit: True if this event was defined explicitly in the XML. Attributes: List Attributes: - colocated_ids: Tuples of volume IDs and their [`EventLinkingType`][acl_anthology.collections.types.EventLinkingType] that are colocated with this event. + colocated_ids: Tuples of volume IDs and their [`EventLink`][acl_anthology.collections.types.EventLink] that are colocated with this event. links: Links to materials for this event paper. The dictionary key specifies the type of link (e.g., "handbook" or "website"). talks: Zero or more references to talks belonging to this event. diff --git a/python/docs/guide/accessing-authors.md b/python/docs/guide/accessing-authors.md index 1349f93e32..261774299d 100644 --- a/python/docs/guide/accessing-authors.md +++ b/python/docs/guide/accessing-authors.md @@ -117,11 +117,7 @@ for name variants _written in a different script_, such as: ### Looking up name specifications In contrast to names, name specifications will _always_ resolve to a _single_ -person. This is enforced by our metadata checks; if name specifications are -ambiguous, they _must_ be resolved before the data can appear in the ACL -Anthology. - -To look up name specifications, use +person. To look up name specifications, use [`anthology.resolve`][acl_anthology.anthology.Anthology.resolve], which will return the person that is being referred to: diff --git a/python/docs/guide/modifying-data.md b/python/docs/guide/modifying-data.md index 9aac029290..8718c94b47 100644 --- a/python/docs/guide/modifying-data.md +++ b/python/docs/guide/modifying-data.md @@ -168,7 +168,7 @@ have an ORCID and other metadata) can be done in two ways: 2. Iterate through `p2.papers()` and `p2.volumes()` {==(TODO: a function to iterate through all items, no matter the type)==} and add `p1`'s new ID to the name specifications that are currently resolved to `p2`. {==TODO: It's currently a bit tricky to find the _name specification_ referring to a person; should add a function for this.==} -3. Save both the PersonIndex and the changed collections. {==TODO: The library current cannot track which collections have actually changed, so there is no "save all" function yet.==} +3. Save the changes, e.g. via `Anthology.save_all()`. ### Example: Disambiguating a person From 443ca0b324d455e47f186bfd1a846ba78f29d626 Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Thu, 21 Aug 2025 11:44:46 +0200 Subject: [PATCH 13/19] Add tests for VenueIndex/SIGIndex.save and a few pragmas --- python/acl_anthology/text/markuptext.py | 6 +++--- python/acl_anthology/venues.py | 6 ++++-- python/tests/sigs_test.py | 10 ++++++++++ python/tests/venues_test.py | 10 ++++++++++ 4 files changed, 27 insertions(+), 5 deletions(-) diff --git a/python/acl_anthology/text/markuptext.py b/python/acl_anthology/text/markuptext.py index a235f196e1..dc30989774 100644 --- a/python/acl_anthology/text/markuptext.py +++ b/python/acl_anthology/text/markuptext.py @@ -108,21 +108,21 @@ class MarkupText: def __contains__(self, key: object) -> bool: if isinstance(key, str): return key in self.as_xml() - return False + return False # pragma: no cover def __eq__(self, other: object) -> bool: if isinstance(other, str): return self.as_xml() == other elif isinstance(other, MarkupText): return self.as_xml() == other.as_xml() - return False + return False # pragma: no cover def __lt__(self, other: object) -> bool: if isinstance(other, str): return self.as_xml() < other elif isinstance(other, MarkupText): return self.as_xml() < other.as_xml() - return False + return False # pragma: no cover def __str__(self) -> str: return self.as_text() diff --git a/python/acl_anthology/venues.py b/python/acl_anthology/venues.py index 5702dd3e53..fc7f8ae371 100644 --- a/python/acl_anthology/venues.py +++ b/python/acl_anthology/venues.py @@ -168,9 +168,11 @@ def create(self, id: str, acronym: str, name: str, **kwargs: Any) -> Venue: if "item_ids" in kwargs: raise KeyError( "Cannot specify `item_ids` for Venue; add its ID to the volume(s) instead." - ) + ) # pragma: no cover if "oldstyle_letter" in kwargs: - raise KeyError("Cannot specify a new venue with an old-style letter.") + raise KeyError( + "Cannot specify a new venue with an old-style letter." + ) # pragma: no cover kwargs["parent"] = self.parent kwargs["path"] = self.parent.datadir / "yaml" / "venues" / f"{id}.yaml" diff --git a/python/tests/sigs_test.py b/python/tests/sigs_test.py index c97de2bb41..92ef1d8b4e 100644 --- a/python/tests/sigs_test.py +++ b/python/tests/sigs_test.py @@ -14,6 +14,8 @@ import pytest from pathlib import Path +from unittest.mock import patch + from acl_anthology.sigs import SIGIndex, SIGMeeting, SIG @@ -115,3 +117,11 @@ def test_sig_by_volume(anthology): sigs = index.by_volume("2022.naloma-1") assert len(sigs) == len(all_toy_sigs) assert set(sig.id for sig in sigs) == set(all_toy_sigs) + + +def test_sigindex_save(anthology): + index = SIGIndex(anthology) + index.load() + with patch.object(SIG, "save") as mock: + index.save() + assert mock.call_count == len(index) diff --git a/python/tests/venues_test.py b/python/tests/venues_test.py index 67da9b2c22..e0b887ed4a 100644 --- a/python/tests/venues_test.py +++ b/python/tests/venues_test.py @@ -15,6 +15,8 @@ import logging import pytest from pathlib import Path +from unittest.mock import patch + from acl_anthology.venues import VenueIndex, Venue @@ -121,3 +123,11 @@ def test_venueindex_noindex(anthology, caplog): index = VenueIndex(anthology, no_item_ids=True) _ = index.get("cl").name assert not any("XML data file" in rec.message for rec in caplog.records) + + +def test_venueindex_save(anthology): + index = VenueIndex(anthology) + index.load() + with patch.object(Venue, "save") as mock: + index.save() + assert mock.call_count == len(index) From 22b8d6e34b332b719b1d515e2421857aeec25bb7 Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Thu, 21 Aug 2025 12:57:12 +0200 Subject: [PATCH 14/19] Add {Paper|Volume}.namespecs and Person.anthology_items() --- python/acl_anthology/collections/paper.py | 5 +++++ python/acl_anthology/collections/volume.py | 5 +++++ python/acl_anthology/people/person.py | 10 ++++++++++ python/docs/guide/accessing-authors.md | 10 ++++++---- python/tests/collections/paper_test.py | 14 ++++++++++++++ python/tests/collections/volume_test.py | 2 ++ python/tests/people/person_test.py | 1 + 7 files changed, 43 insertions(+), 4 deletions(-) diff --git a/python/acl_anthology/collections/paper.py b/python/acl_anthology/collections/paper.py index c5ef86c884..617fce6427 100644 --- a/python/acl_anthology/collections/paper.py +++ b/python/acl_anthology/collections/paper.py @@ -464,6 +464,11 @@ def web_url(self) -> str: """The URL of this paper's landing page on the ACL Anthology website.""" return cast(str, config["paper_page_template"]).format(self.full_id) + @property + def namespecs(self) -> list[NameSpecification]: + """All name specifications on this paper.""" + return self.authors + self.editors + def get_editors(self) -> list[NameSpecification]: """ Returns: diff --git a/python/acl_anthology/collections/volume.py b/python/acl_anthology/collections/volume.py index 1434397bd2..029d3e452a 100644 --- a/python/acl_anthology/collections/volume.py +++ b/python/acl_anthology/collections/volume.py @@ -176,6 +176,11 @@ def web_url(self) -> str: """The URL of this volume's landing page on the ACL Anthology website.""" return cast(str, config["volume_page_template"]).format(self.full_id) + @property + def namespecs(self) -> list[NameSpecification]: + """All name specifications on this volume.""" + return self.editors + def get_events(self) -> list[Event]: """ Returns: diff --git a/python/acl_anthology/people/person.py b/python/acl_anthology/people/person.py index 36a545989b..b50329d477 100644 --- a/python/acl_anthology/people/person.py +++ b/python/acl_anthology/people/person.py @@ -264,6 +264,16 @@ def namespec_refers_to_self(namespec: NameSpecification) -> bool: namespec.id = new_id volume.collection.is_modified = True + def anthology_items(self) -> Iterator[Paper | Volume]: + """Returns an iterator over all Anthology items associated with this person, regardless of their type.""" + for anthology_id in self.item_ids: + item = self.parent.get(anthology_id) + if item is None: + raise ValueError( + f"Person {self.id} lists associated item {build_id_from_tuple(anthology_id)}, which doesn't exist" + ) # pragma: no cover + yield item + def papers(self) -> Iterator[Paper]: """Returns an iterator over all papers associated with this person. diff --git a/python/docs/guide/accessing-authors.md b/python/docs/guide/accessing-authors.md index 261774299d..a2d3614b3c 100644 --- a/python/docs/guide/accessing-authors.md +++ b/python/docs/guide/accessing-authors.md @@ -174,10 +174,12 @@ You can get a set of all items associated with a person: {('Q18', '1', '28'), ('2020.findings', 'emnlp', '158'), ('W11', '15', '16'), ...} ``` -For convenience, you can also use -[`Person.volumes()`][acl_anthology.people.person.Person.volumes] and -[`Person.papers()`][acl_anthology.people.person.Person.papers] to iterate over -the set of volumes/papers that person is associated with. +You can iterate over +[`Person.anthology_items()`][acl_anthology.people.person.Person.anthology_items] +to get the actual items the person is associated with. If you know that you only +want to iterate over papers or volumes, you can also use +[`Person.volumes()`][acl_anthology.people.person.Person.volumes] or +[`Person.papers()`][acl_anthology.people.person.Person.papers] instead. ## An Entity-Relationship diagram diff --git a/python/tests/collections/paper_test.py b/python/tests/collections/paper_test.py index b6e116e851..4f65311174 100644 --- a/python/tests/collections/paper_test.py +++ b/python/tests/collections/paper_test.py @@ -61,6 +61,20 @@ def test_paper_web_url(anthology): assert paper.web_url == "https://aclanthology.org/2022.acl-demo.2/" +def test_paper_namespecs(): + authors = [NameSpecification("John", "Doe")] + editors = [NameSpecification("Jane", "Doe")] + paper = Paper( + "42", + None, + bibkey="nn-2025-conthrived", + title="A conthrived example just for testing", + authors=authors, + editors=editors, + ) + assert paper.namespecs == authors + editors + + def test_paper_get_events(anthology): paper = anthology.get_paper("2022.acl-demo.2") assert paper is not None diff --git a/python/tests/collections/volume_test.py b/python/tests/collections/volume_test.py index cf9401a737..9969a73e5f 100644 --- a/python/tests/collections/volume_test.py +++ b/python/tests/collections/volume_test.py @@ -192,6 +192,8 @@ def test_volume_attributes_2022acl_demo(anthology): assert volume.venue_acronym == "ACL" assert not volume.is_workshop assert isinstance(volume.frontmatter, Paper) and volume.frontmatter.id == "0" + assert len(volume.editors) == 3 + assert volume.editors == volume.namespecs def test_volume_attributes_j89(anthology): diff --git a/python/tests/people/person_test.py b/python/tests/people/person_test.py index bc8cd6aa36..f4e292316b 100644 --- a/python/tests/people/person_test.py +++ b/python/tests/people/person_test.py @@ -131,6 +131,7 @@ def test_person_papers_unverified(anthology): person = anthology.get_person("unverified/nicoletta-calzolari") assert person.canonical_name == Name("Nicoletta", "Calzolari") assert len(person.item_ids) == 3 + assert len(list(person.anthology_items())) == 3 assert len(list(person.papers())) == 2 assert len(list(person.volumes())) == 1 From 26047b7a024b214157ceec900aabe0eb79950897 Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Thu, 21 Aug 2025 14:04:19 +0200 Subject: [PATCH 15/19] Refactor update_id -> change_id, add merge_with_explicit --- python/acl_anthology/people/index.py | 7 ++ python/acl_anthology/people/person.py | 100 ++++++++++++------ python/docs/guide/modifying-data.md | 7 +- .../tests/data/anthology/xml/2022.naloma.xml | 7 ++ python/tests/people/person_test.py | 44 ++++++-- 5 files changed, 118 insertions(+), 47 deletions(-) diff --git a/python/acl_anthology/people/index.py b/python/acl_anthology/people/index.py index c94b1d83ad..6b96f45be9 100644 --- a/python/acl_anthology/people/index.py +++ b/python/acl_anthology/people/index.py @@ -392,9 +392,16 @@ def _update_id(self, old_id: str, new_id: str) -> None: Parameters: old_id: A person ID that already exists in the index. new_id: The new person ID it should be changed to, which mustn't exist in the index. + + Raises: + KeyError: If the new ID already exists. """ if not self.is_data_loaded: return + if new_id in self.data: + raise KeyError( + f"Tried to add ID '{new_id}' to PersonIndex which already exists" + ) person = self.data.pop(old_id) self.data[new_id] = person # Note: cannot remove from DisjointSet diff --git a/python/acl_anthology/people/person.py b/python/acl_anthology/people/person.py index b50329d477..a4deeeb14f 100644 --- a/python/acl_anthology/people/person.py +++ b/python/acl_anthology/people/person.py @@ -17,7 +17,6 @@ import attrs from attrs import define, field, setters from enum import Enum -import itertools as it from typing import Any, Iterator, Optional, Sequence, TYPE_CHECKING from ..exceptions import AnthologyException, AnthologyInvalidIDError from ..utils.attrs import auto_validate_types @@ -77,7 +76,7 @@ class Person: Person objects **can** be used to make changes to metadata that appears in `people.yaml`, such as ORCID, comment, degree, and alternative names for this person. Attributes: - id: A unique ID for this person. + id: A unique ID for this person. Do not change this attribute directly; use [`change_id()`][acl_anthology.people.person.Person.change_id], [`make_explicit()`][acl_anthology.people.person.Person.make_explicit], or [`merge_with_explicit()`][acl_anthology.people.person.Person.merge_with_explicit] instead. parent: The parent Anthology instance to which this person belongs. item_ids: A list of volume and/or paper IDs this person has authored or edited. orcid: The person's ORCID. @@ -205,6 +204,34 @@ def has_name(self, name: Name) -> bool: """ return any(existing_name == name for (existing_name, _) in self._names) + def change_id(self, new_id: str) -> None: + """Change this person's ID. + + This updates `self.id`, but also ensures that all papers/items with the old ID are updated to the new one. + + Parameters: + new_id: The new ID for this person, which must match [`RE_VERIFIED_PERSON_ID`][acl_anthology.utils.ids.RE_VERIFIED_PERSON_ID]. + + Raises: + AnthologyException: If `self.explicit` is False. + AnthologyInvalidIDError: If the supplied ID is not well-formed, or if it already exists in the PersonIndex. + """ + if new_id in self.parent.people: + exc = AnthologyInvalidIDError(new_id, f"Person ID already exists: {new_id}") + exc.add_note("Did you want to use merge_with_explicit() instead?") + raise exc + if not self.is_explicit: + exc2 = AnthologyException("Can only update ID for explicit person") + exc2.add_note("Did you want to use make_explicit() instead?") + raise exc2 + if not is_verified_person_id(new_id): + raise AnthologyInvalidIDError( + new_id, f"Not a valid verified-person ID: {new_id}" + ) + + self._set_id_on_items(new_id) + self.id = new_id # triggers update in PersonIndex + def make_explicit(self, new_id: str) -> None: """Turn this person that was implicitly created into an explicitly-represented one. @@ -218,51 +245,59 @@ def make_explicit(self, new_id: str) -> None: ValueError: If the supplied ID is not valid, or if it already exists in the PersonIndex. """ if self.is_explicit: - raise AnthologyException("Person is already explicit") + raise AnthologyException(f"Person '{self.id}' is already explicit") + if not is_verified_person_id(new_id): + raise AnthologyInvalidIDError( + new_id, f"Not a valid verified-person ID: {new_id}" + ) + self._set_id_on_items(new_id) self.is_explicit = True - self.update_id(new_id) + self.id = new_id # triggers update in PersonIndex self._names = [(name, NameLink.EXPLICIT) for name, _ in self._names] - def update_id(self, new_id: str) -> None: - """Update the ID of this person, including on all of their associated papers. + def merge_with_explicit(self, person: Person) -> None: + """Merge this person that was implicitly created with an explicitly-represented one. - In contrast to simply changing the `id` attribute, this function will go through all associated papers and update the ID attribute there. + This will add the explicit person's ID to all papers and volumes currently associated with this inferred person. Parameters: - new_id: The new ID for this person, which must match [`RE_VERIFIED_PERSON_ID`][acl_anthology.utils.ids.RE_VERIFIED_PERSON_ID]. + person: An explicit person to merge this person's items into. Raises: - AnthologyException: If `self.is_explicit` is False. - AnthologyInvalidIDError: If the supplied ID is not valid, or if it already exists in the PersonIndex. + AnthologyException: If `self.explicit` is True or `person.explicit` is False. """ - if not self.is_explicit: - exc = AnthologyException("Can only update ID for explicit person") - exc.add_note("Did you want to use make_explicit() instead?") - raise exc - if not is_verified_person_id(new_id): - raise AnthologyInvalidIDError( - new_id, f"Not a valid verified-person ID: {new_id}" + if self.is_explicit: + raise AnthologyException("Can only merge non-explicit persons") + if not person.is_explicit: + raise AnthologyException( + f"Can only merge with explicit persons; not '{person.id}'" ) - old_id = self.id + self._set_id_on_items(person.id) + person.item_ids.extend(self.item_ids) + self.item_ids = [] - def namespec_refers_to_self(namespec: NameSpecification) -> bool: - if is_verified_person_id(old_id): - return namespec.id == old_id - return namespec.id is None and self.has_name(namespec.name) + for name in self.names: + person.add_name(name, inferred=False) - self.id = new_id # will update PersonIndex - for paper in self.papers(): - for namespec in it.chain(paper.authors, paper.editors): - if namespec_refers_to_self(namespec): - namespec.id = new_id - paper.collection.is_modified = True - for volume in self.volumes(): - for namespec in volume.editors: + def _set_id_on_items(self, new_id: str) -> None: + """Set `new_id` on all name specifications that currently resolve to this person. Intended for internal use.""" + if self.is_explicit: + + def namespec_refers_to_self(namespec: NameSpecification) -> bool: + return namespec.id == self.id + + else: + + def namespec_refers_to_self(namespec: NameSpecification) -> bool: + return self.parent.resolve(namespec) is self + + for item in self.anthology_items(): + for namespec in item.namespecs: if namespec_refers_to_self(namespec): namespec.id = new_id - volume.collection.is_modified = True + item.collection.is_modified = True def anthology_items(self) -> Iterator[Paper | Volume]: """Returns an iterator over all Anthology items associated with this person, regardless of their type.""" @@ -272,7 +307,8 @@ def anthology_items(self) -> Iterator[Paper | Volume]: raise ValueError( f"Person {self.id} lists associated item {build_id_from_tuple(anthology_id)}, which doesn't exist" ) # pragma: no cover - yield item + # TODO: typing issue will be resolved later with CollectionItem refactoring + yield item # type: ignore def papers(self) -> Iterator[Paper]: """Returns an iterator over all papers associated with this person. diff --git a/python/docs/guide/modifying-data.md b/python/docs/guide/modifying-data.md index 8718c94b47..58e73d861c 100644 --- a/python/docs/guide/modifying-data.md +++ b/python/docs/guide/modifying-data.md @@ -164,9 +164,9 @@ have an ORCID and other metadata) can be done in two ways: **Situation:** An author has published under multiple names, and therefore two separate persons get instantiated for them (let's call them `p1` and `p2`). We want to merge them into a single person. -1. If neither `p1` nor `p2` are _explicit_: Call [`p1.make_explicit()`][acl_anthology.people.person.Person.make_explicit]. This will create an entry in `people.yaml` with all current names of `p1` and add the new ID to all papers and volumes currently inferred to belong to `p1`. +1. If neither person is _explicit_ yet: Call [`p1.make_explicit()`][acl_anthology.people.person.Person.make_explicit]. This will create an entry in `people.yaml` with all current names of `p1` add the new ID to all papers and volumes currently inferred to belong to either `p1`. -2. Iterate through `p2.papers()` and `p2.volumes()` {==(TODO: a function to iterate through all items, no matter the type)==} and add `p1`'s new ID to the name specifications that are currently resolved to `p2`. {==TODO: It's currently a bit tricky to find the _name specification_ referring to a person; should add a function for this.==} +2. `p1` can now assumed to be explicit. If `p2` is not explicit, call [`p2.merge_with_explicit(p1)`][acl_anthology.people.person.Person.merge_with_explicit]. This will add all of `p2`'s names to `p1` and set `p1`'s ID on all papers and volumes currently inferred to belong to `p2`. 3. Save the changes, e.g. via `Anthology.save_all()`. @@ -176,8 +176,7 @@ have an ORCID and other metadata) can be done in two ways: 1. Call [`anthology.people.create()`][acl_anthology.people.index.PersonIndex.create] for all persons who do not have an explicit ID yet, giving all the names that can refer to this person. Also supply the ORCID when calling this function, if it is known. -2. For each person, iterate through the papers that actually belong to them and update the name specification that currently resolves to `p1` by setting the explicit ID of the correct newly-created person. {==TODO: Same as above: It's currently a bit tricky to find the _name specification_ referring to a person; should add a function for this.==} - +2. For each person, go through the papers that actually belong to them and update the name specification where `namespec.id == p1` by setting the explicit ID of the correct newly-created person. {==TODO==} ## Ingesting new proceedings diff --git a/python/tests/data/anthology/xml/2022.naloma.xml b/python/tests/data/anthology/xml/2022.naloma.xml index 748c056796..18f352aa85 100644 --- a/python/tests/data/anthology/xml/2022.naloma.xml +++ b/python/tests/data/anthology/xml/2022.naloma.xml @@ -64,5 +64,12 @@ 2022.naloma-1.5 gubelmann-etal-2022-philosophically + + Fake Paper + YangLiu + 51–52 + 2022.naloma-1.5 + liu-2022-fake + diff --git a/python/tests/people/person_test.py b/python/tests/people/person_test.py index f4e292316b..2e16b99066 100644 --- a/python/tests/people/person_test.py +++ b/python/tests/people/person_test.py @@ -143,40 +143,46 @@ def test_person_papers_verified(anthology): assert len(list(person.papers())) == 2 -def test_person_update_id(anthology): +def test_person_change_id(anthology): person = anthology.get_person("marcel-bollmann") - person.update_id("marcel-bollmann-rub") + person.change_id("marcel-bollmann-rub") assert anthology.get_person("marcel-bollmann") is None assert anthology.get_person("marcel-bollmann-rub") is person - person.update_id("marcel-bollmann") + person.change_id("marcel-bollmann") assert anthology.get_person("marcel-bollmann") is person assert anthology.get_person("marcel-bollmann-rub") is None -def test_person_update_id_should_update_connected_papers(anthology): +def test_person_change_id_should_update_connected_papers(anthology): person = anthology.get_person("yang-liu-ict") - person.update_id("yang-liu-new") + person.change_id("yang-liu-new") namespec = anthology.get(person.item_ids[0]).authors[-1] assert namespec.name == Name("Yang", "Liu") assert namespec.id == "yang-liu-new" assert anthology.collections["2022.acl"].is_modified -def test_person_cannot_update_id_when_inferred(anthology): +def test_person_cannot_change_id_when_inferred(anthology): person = anthology.get_person("unverified/nicoletta-calzolari") assert not person.is_explicit with pytest.raises(AnthologyException): - person.update_id("nicoletta-calzolari") + person.change_id("nicoletta-calzolari") -def test_person_cannot_update_id_with_invalid_id(anthology): +def test_person_cannot_change_id_with_invalid_id(anthology): person = anthology.get_person("marcel-bollmann") with pytest.raises(AnthologyInvalidIDError): - person.update_id("Marcel-Bollmann") + person.change_id("Marcel-Bollmann") with pytest.raises(AnthologyInvalidIDError): - person.update_id("42-marcel-bollmann") + person.change_id("42-marcel-bollmann") with pytest.raises(AnthologyInvalidIDError): - person.update_id("marcel bollmann") + person.change_id("marcel bollmann") + + +def test_person_cannot_change_id_to_existing_id(anthology): + person = anthology.get_person("marcel-bollmann") + with pytest.raises(AnthologyInvalidIDError): + person.change_id("yang-liu-ict") def test_person_with_name_variants(anthology): @@ -213,6 +219,22 @@ def test_person_make_explicit_should_raise_when_explicit(anthology): person.make_explicit("marcel-bollmann") +def test_person_merge_with_explicit(anthology): + # Pre-conditions + person1 = anthology.get_person("unverified/yang-liu") + assert not person1.is_explicit + assert person1.item_ids == [("2022.naloma", "1", "6")] + person2 = anthology.get_person("yang-liu-microsoft") + assert person2.item_ids == [("2022.acl", "long", "226")] + + # Test merging + person1.merge_with_explicit(person2) + assert not person1.item_ids + assert person2.item_ids == [("2022.acl", "long", "226"), ("2022.naloma", "1", "6")] + namespec = anthology.get_paper(("2022.naloma", "1", "6")).authors[0] + assert namespec.id == "yang-liu-microsoft" + + def test_person_equality(anthology_stub): n = Name("Yang", "Liu") person1 = Person("yang-liu", anthology_stub, [n]) From 8ff8694357cb8f0fa24bd374ea302ab4a4c4b10c Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Thu, 21 Aug 2025 14:06:17 +0200 Subject: [PATCH 16/19] Update CHANGELOG --- python/CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/CHANGELOG.md b/python/CHANGELOG.md index 9015d6b8d0..554619474a 100644 --- a/python/CHANGELOG.md +++ b/python/CHANGELOG.md @@ -11,8 +11,9 @@ This release implements the new [name resolution and author ID logic](https://gi - Person: - Now provides `orcid`, `degree`, `disable_name_matching`, and `similar_ids` fields that correspond to the respective fields in the new `people.yaml`. - Changing `id`, `orcid`, `names`, or using `add_name()` or `remove_names()` will now automatically update the PersonIndex. - - Added `update_id()` that updates a person's ID on all of their connected papers. + - Added `change_id()` that updates a person's ID on all of their connected papers. - Added `make_explicit()` that makes all necessary changes to change an implicit ("unverified/") to an explicit Person. + - Added `merge_with_explicit()` that makes all necessary changes to move an implicit ("unverified/") person's papers/volumes to an explicit Person. - PersonIndex: - Now also indexes Person objects by ORCID, and provides `by_orcid` and `get_by_orcid()`. - Now also keeps a mapping of name slugs to (verified) person IDs, via `slugs_to_verified_ids` (mostly for internal use). From 26a65a27f61254fe6816586e0e6d3774a48562ce Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Thu, 21 Aug 2025 15:18:44 +0200 Subject: [PATCH 17/19] Fix tests --- python/tests/anthology_test.py | 4 ++-- python/tests/collections/collection_test.py | 2 +- python/tests/people/person_test.py | 2 ++ python/tests/people/personindex_test.py | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/python/tests/anthology_test.py b/python/tests/anthology_test.py index 0f6a249d3f..c3d90baf1f 100644 --- a/python/tests/anthology_test.py +++ b/python/tests/anthology_test.py @@ -116,7 +116,7 @@ def test_papers(anthology): count += 1 found.add(paper.collection_id) assert expected == found - assert count == 851 + assert count == 852 def test_papers_by_collection_id(anthology): @@ -124,7 +124,7 @@ def test_papers_by_collection_id(anthology): for paper in anthology.papers("2022.naloma"): assert paper.collection_id == "2022.naloma" count += 1 - assert count == 6 + assert count == 7 def test_papers_by_volume_id(anthology): diff --git a/python/tests/collections/collection_test.py b/python/tests/collections/collection_test.py index b6fb6665f5..975abe7bf1 100644 --- a/python/tests/collections/collection_test.py +++ b/python/tests/collections/collection_test.py @@ -32,7 +32,7 @@ test_cases_xml_collections = ( # (filename, # volumes, # papers, has event?) ("2022.acl.xml", 5, 779, True), - ("2022.naloma.xml", 1, 6, False), + ("2022.naloma.xml", 1, 7, False), ("J89.xml", 4, 61, False), ("L06.xml", 1, 5, False), ) diff --git a/python/tests/people/person_test.py b/python/tests/people/person_test.py index 2e16b99066..c9707cb470 100644 --- a/python/tests/people/person_test.py +++ b/python/tests/people/person_test.py @@ -234,6 +234,8 @@ def test_person_merge_with_explicit(anthology): namespec = anthology.get_paper(("2022.naloma", "1", "6")).authors[0] assert namespec.id == "yang-liu-microsoft" + anthology.reset_indices() + def test_person_equality(anthology_stub): n = Name("Yang", "Liu") diff --git a/python/tests/people/personindex_test.py b/python/tests/people/personindex_test.py index f23e8a1e0c..b9804f595c 100644 --- a/python/tests/people/personindex_test.py +++ b/python/tests/people/personindex_test.py @@ -156,7 +156,7 @@ def test_get_person_coauthors_counter(index): def test_get_by_namespec(index): - ns1 = NameSpecification(Name("Yang", "Liu")) + ns1 = NameSpecification(Name("Li", "Feng")) # does not exist ns2 = NameSpecification(Name("Yang", "Liu"), id="yang-liu-microsoft") with pytest.raises(NameSpecResolutionError): index.get_by_namespec(ns1) From fed153e7467116d712b224e863ea3866223cbf74 Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Thu, 21 Aug 2025 18:21:04 +0200 Subject: [PATCH 18/19] MarkupText: Make __contains__ check raw and XML text; add __len__ --- python/acl_anthology/text/markuptext.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/acl_anthology/text/markuptext.py b/python/acl_anthology/text/markuptext.py index dc30989774..49a444d93b 100644 --- a/python/acl_anthology/text/markuptext.py +++ b/python/acl_anthology/text/markuptext.py @@ -107,7 +107,7 @@ class MarkupText: def __contains__(self, key: object) -> bool: if isinstance(key, str): - return key in self.as_xml() + return (key in self.as_text()) or (key in self.as_xml()) return False # pragma: no cover def __eq__(self, other: object) -> bool: @@ -124,6 +124,9 @@ def __lt__(self, other: object) -> bool: return self.as_xml() < other.as_xml() return False # pragma: no cover + def __len__(self) -> int: + return len(self.as_text()) + def __str__(self) -> str: return self.as_text() From e9efacebfe29548fa17d3f99a6406c937a9cc9a7 Mon Sep 17 00:00:00 2001 From: Marcel Bollmann Date: Thu, 21 Aug 2025 18:21:41 +0200 Subject: [PATCH 19/19] Add justfile in main Anthology folder --- justfile | 7 +++++++ python/justfile | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 justfile diff --git a/justfile b/justfile new file mode 100644 index 0000000000..8ede6e6b93 --- /dev/null +++ b/justfile @@ -0,0 +1,7 @@ +@_default: + just -l + echo -e "\npython:" + just -l python + +# Call recipes from the Python library +mod python diff --git a/python/justfile b/python/justfile index 0e47f67d39..953176920f 100644 --- a/python/justfile +++ b/python/justfile @@ -34,7 +34,7 @@ fix-and-test: fix test-all # Run all tests test-all *FLAGS: _deps - poetry run pytest -m "not integration" {{FLAGS}} + poetry run pytest -m "not integration" --stepwise {{FLAGS}} # Run all tests and generate coverage report test-with-coverage: _deps