Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions justfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Default recipe: print the recipes of this justfile, then a "python:" heading
# followed by the recipes of the `python` module declared below.
@_default:
just -l
echo -e "\npython:"
just -l python

# Call recipes from the Python library
# (declares the `python` module; presumably backed by python/justfile — verify)
mod python
11 changes: 9 additions & 2 deletions python/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,22 @@ This release implements the new [name resolution and author ID logic](https://gi

### Added

- Anthology now provides `save_all()` to conveniently save all data files. The library tracks modifications to collection objects to only write XML files that have actually changed.
- NameSpecification now provides an `orcid` field.
- Person:
- Now provides `orcid`, `degree`, `disable_name_matching`, and `similar_ids` fields that correspond to the respective fields in the new `people.yaml`.
- Changing `id`, `orcid`, `names`, or using `add_name()` or `remove_name()` will now automatically update the PersonIndex.
- Added `update_id()` that updates a person's ID on all of their connected papers.
- Added `change_id()` that updates a person's ID on all of their connected papers.
- Added `make_explicit()` that makes all necessary changes to turn an implicit ("unverified/") person into an explicit Person.
- Added `merge_with_explicit()` that makes all necessary changes to move an implicit ("unverified/") person's papers/volumes to an explicit Person.
- PersonIndex:
- Now also indexes Person objects by ORCID, and provides `by_orcid` and `get_by_orcid()`.
- Now also keeps a mapping of name slugs to (verified) person IDs, via `slugs_to_verified_ids` (mostly for internal use).
- Added `ingest_namespec()` to implement the [matching logic on ingestion](https://github.com/acl-org/acl-anthology/wiki/Author-Page-Plan#ingestion) of new volumes.
- Added `create_person()` to instantiate a new Person and add it to the index.
- Added `PersonIndex.create` to instantiate a new Person and add it to the index.
- MarkupText now provides a `from_()` class method that calls the appropriate builder method, using heuristic markup parsing if instantiated from a string.
- MarkupText now supports some common string methods, such as `__contains__`, `endswith`, `startswith`.
- Venues can now be created via `VenueIndex.create()`.

### Changed

Expand All @@ -28,6 +33,8 @@ This release implements the new [name resolution and author ID logic](https://gi
- Changed the previously experimental `save()` function to serialize the `people.yaml` file.
- Person now stores names as tuples of `(Name, NameLink)`, the latter of which indicates if the name was explicitly defined in `people.yaml` or inferred by the name resolution logic (e.g. via slug matching). As a consequence, `Person.names` can no longer be modified in-place; use `Person.add_name()`, `Person.remove_name()`, or the setter of `Person.names`.
- Setting a canonical name for a Person changed from `.set_canonical_name()` to `Person.canonical_name = ...`
- Attributes that expect a MarkupText, such as `Volume.title` or `Paper.abstract`, can now be set to a string, in which case the string will be automatically converted to MarkupText, including markup parsing.
- EventLinkingType renamed to EventLink.

## [0.5.3] — 2025-06-22

Expand Down
14 changes: 14 additions & 0 deletions python/acl_anthology/anthology.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,20 @@ def load_all(self) -> Self:
)
return self

def save_all(self) -> Self:
    """Save all Anthology data files.

    Only collections whose `is_modified` flag is set are written; the
    people and venue indices are always saved.  SIG metadata is _not_
    saved by this function, so a `UserWarning` is emitted as a reminder.

    Returns:
        This instance, to allow method chaining.
    """
    for collection in self.collections.values():
        # Skip collections that are not flagged as modified.
        if collection.is_modified:
            collection.save()
    self.people.save()
    self.venues.save()
    # stacklevel=2 attributes the warning to the code that called
    # save_all() rather than to this line inside the library.
    warnings.warn(
        "SIG metadata is not yet automatically saved. Call `.sigs.save()` manually if you need this.",
        UserWarning,
        stacklevel=2,
    )
    return self

def reset_indices(self) -> Self:
"""Reset all non-collection indices.

Expand Down
4 changes: 2 additions & 2 deletions python/acl_anthology/collections/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from .eventindex import EventIndex
from .event import Event, Talk
from .volume import Volume
from .types import EventLinkingType, PaperDeletionType, PaperType, VolumeType
from .types import EventLink, PaperDeletionType, PaperType, VolumeType
from .paper import Paper


Expand All @@ -28,7 +28,7 @@
"CollectionIndex",
"Event",
"EventIndex",
"EventLinkingType",
"EventLink",
"Paper",
"PaperDeletionType",
"PaperType",
Expand Down
15 changes: 10 additions & 5 deletions python/acl_anthology/collections/collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
from ..utils.logging import get_logger
from ..utils import xml
from .event import Event
from .types import EventLinkingType, VolumeType
from .types import EventLink, VolumeType
from .volume import Volume
from .paper import Paper

Expand Down Expand Up @@ -63,6 +63,7 @@ class Collection(SlottedDict[Volume]):
Attributes: Non-Init Attributes:
event: An event represented by this collection.
is_data_loaded: A flag indicating whether the XML file has already been loaded.
is_modified: A flag indicating whether any of the data in this collection has been modified after loading.
"""

id: str = field(converter=int_to_str)
Expand All @@ -75,6 +76,7 @@ class Collection(SlottedDict[Volume]):
validator=v.optional(v.instance_of(Event)),
)
is_data_loaded: bool = field(init=False, repr=True, default=False)
is_modified: bool = field(init=False, repr=False, default=False)

@id.validator
def _check_id(self, _: Any, value: str) -> None:
Expand Down Expand Up @@ -148,7 +150,7 @@ def validate_schema(self) -> Self:
def create_volume(
self,
id: str,
title: MarkupText,
title: MarkupText | str,
year: Optional[str] = None,
type: VolumeType = VolumeType.PROCEEDINGS,
**kwargs: Any,
Expand All @@ -157,7 +159,7 @@ def create_volume(

Parameters:
id: The ID of the new volume.
title: The title of the new volume.
title: The title of the new volume. If given as a string, it will be [heuristically parsed for markup][acl_anthology.text.markuptext.MarkupText.from_].
year: The year of the new volume (optional); if None, will infer the year from this collection's ID.
type: Whether this is a journal or proceedings volume; defaults to [VolumeType.PROCEEDINGS][acl_anthology.collections.types.VolumeType].
**kwargs: Any valid list or optional attribute of [Volume][acl_anthology.collections.volume.Volume].
Expand Down Expand Up @@ -200,6 +202,7 @@ def create_volume(
self.root.people._add_to_index(volume.editors, volume.full_id_tuple)

self.data[id] = volume
self.is_modified = True
return volume

def create_event(
Expand Down Expand Up @@ -242,6 +245,7 @@ def create_event(
**kwargs,
)
self.root.events._add_to_index(self.event)
self.is_modified = True
return self.event

def load(self) -> None:
Expand Down Expand Up @@ -286,15 +290,16 @@ def load(self) -> None:
if self.event is not None:
# Events are implicitly linked to volumes defined in the same collection
self.event.colocated_ids = [
(volume.full_id_tuple, EventLinkingType.INFERRED)
(volume.full_id_tuple, EventLink.INFERRED)
for volume in self.data.values()
# Edge case: in case the <colocated> block lists a volume in
# the same collection, don't add it twice
if (volume.full_id_tuple, EventLinkingType.EXPLICIT)
if (volume.full_id_tuple, EventLink.EXPLICIT)
not in self.event.colocated_ids
] + self.event.colocated_ids

self.is_data_loaded = True
self.is_modified = False

def save(self, path: Optional[StrPath] = None, minimal_diff: bool = True) -> None:
"""Saves this collection as an XML file.
Expand Down
43 changes: 26 additions & 17 deletions python/acl_anthology/collections/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,17 @@

from __future__ import annotations

from attrs import define, field, validators as v
from attrs import define, field, converters, setters, validators as v
from lxml import etree
from lxml.builder import E
from typing import Any, Iterator, Optional, TYPE_CHECKING

from .types import EventLinkingType
from .types import EventLink
from ..constants import RE_EVENT_ID
from ..files import EventFileReference
from ..people import NameSpecification
from ..text import MarkupText
from ..utils.attrs import auto_validate_types
from ..text import MarkupText, to_markuptext
from ..utils.attrs import auto_validate_types, track_modifications
from ..utils.ids import AnthologyID, AnthologyIDTuple, parse_id, build_id_from_tuple

if TYPE_CHECKING:
Expand All @@ -44,7 +44,7 @@ class Talk:
attachments: Links to attachments for this talk. The dictionary key specifies the type of attachment (e.g., "video" or "slides").
"""

title: MarkupText = field()
title: MarkupText = field(converter=to_markuptext)
type: Optional[str] = field(default=None)
speakers: list[NameSpecification] = field(factory=list)
attachments: dict[str, EventFileReference] = field(factory=dict)
Expand Down Expand Up @@ -87,7 +87,10 @@ def to_xml(self) -> etree._Element:
return elem


@define(field_transformer=auto_validate_types)
@define(
field_transformer=auto_validate_types,
on_setattr=[setters.convert, setters.validate, track_modifications],
)
class Event:
"""An event, such as a meeting or a conference.

Expand All @@ -100,7 +103,7 @@ class Event:
is_explicit: True if this event was defined explicitly in the XML.

Attributes: List Attributes:
colocated_ids: Tuples of volume IDs and their [`EventLinkingType`][acl_anthology.collections.types.EventLinkingType] that are colocated with this event.
colocated_ids: Tuples of volume IDs and their [`EventLink`][acl_anthology.collections.types.EventLink] that are colocated with this event.
links: Links to materials for this event paper. The dictionary key specifies the type of link (e.g., "handbook" or "website").
talks: Zero or more references to talks belonging to this event.

Expand All @@ -112,9 +115,9 @@ class Event:

id: str = field(validator=v.matches_re(RE_EVENT_ID))
parent: Collection = field(repr=False, eq=False)
is_explicit: bool = field(default=False, converter=bool)
is_explicit: bool = field(default=False, converter=bool) # TODO: freeze?

colocated_ids: list[tuple[AnthologyIDTuple, EventLinkingType]] = field(
colocated_ids: list[tuple[AnthologyIDTuple, EventLink]] = field(
factory=list,
repr=lambda x: f"<list of {len(x)} tuples>",
)
Expand All @@ -128,10 +131,17 @@ class Event:
),
)

title: Optional[MarkupText] = field(default=None)
title: Optional[MarkupText] = field(
default=None, converter=converters.optional(to_markuptext)
)
location: Optional[str] = field(default=None)
dates: Optional[str] = field(default=None)

@property
def collection(self) -> Collection:
"""The collection this event belongs to.

Read-only alias for `parent`; returns the same object.
"""
return self.parent

@property
def collection_id(self) -> str:
"""The collection ID this event belongs to."""
Expand All @@ -156,7 +166,7 @@ def volumes(self) -> Iterator[Volume]:
def add_colocated(
self,
volume: Volume | AnthologyID,
type_: EventLinkingType = EventLinkingType.EXPLICIT,
type_: EventLink = EventLink.EXPLICIT,
) -> None:
"""Add a co-located volume to this event.

Expand All @@ -175,14 +185,13 @@ def add_colocated(

for idx, (existing_id, existing_type) in enumerate(self.colocated_ids):
if volume_id == existing_id:
if (
existing_type == EventLinkingType.INFERRED
and type_ == EventLinkingType.EXPLICIT
):
if existing_type == EventLink.INFERRED and type_ == EventLink.EXPLICIT:
self.colocated_ids[idx] = (volume_id, type_)
return

self.colocated_ids.append((volume_id, type_))
if type_ == EventLink.EXPLICIT:
self.collection.is_modified = True

# Update the event index as well
if self.root.events.is_data_loaded:
Expand Down Expand Up @@ -213,7 +222,7 @@ def from_xml(cls, parent: Collection, event: etree._Element) -> Event:
kwargs["talks"].append(Talk.from_xml(element))
elif element.tag == "colocated":
kwargs["colocated_ids"] = [
(parse_id(str(volume_id.text)), EventLinkingType.EXPLICIT)
(parse_id(str(volume_id.text)), EventLink.EXPLICIT)
for volume_id in element
if volume_id.tag == "volume-id"
]
Expand Down Expand Up @@ -252,7 +261,7 @@ def to_xml(self) -> etree._Element:
if self.colocated_ids:
colocated = E.colocated()
for id_tuple, el_type in self.colocated_ids:
if el_type == EventLinkingType.EXPLICIT:
if el_type == EventLink.EXPLICIT:
colocated.append(
getattr(E, "volume-id")(build_id_from_tuple(id_tuple))
)
Expand Down
6 changes: 3 additions & 3 deletions python/acl_anthology/collections/eventindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from ..utils.ids import AnthologyID, AnthologyIDTuple, parse_id
from ..utils.logging import get_logger
from .event import Event
from .types import EventLinkingType
from .types import EventLink
from .volume import Volume

if TYPE_CHECKING:
Expand Down Expand Up @@ -124,12 +124,12 @@ def load(self) -> None:
event_id,
collection,
is_explicit=False,
colocated_ids=[(volume_fid, EventLinkingType.INFERRED)],
colocated_ids=[(volume_fid, EventLink.INFERRED)],
title=MarkupText.from_string(event_name),
)
else:
# Add implicit connection to existing event
event.add_colocated(volume_fid, EventLinkingType.INFERRED)
event.add_colocated(volume_fid, EventLink.INFERRED)
self.reverse[volume_fid].add(event_id)
except Exception as exc:
log.exception(exc)
Expand Down
1 change: 1 addition & 0 deletions python/acl_anthology/collections/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ def create(self, id: str) -> Collection:
path=self.parent.datadir / "xml" / f"{id}.xml",
)
collection.is_data_loaded = True
collection.is_modified = True
self.data[id] = collection

return collection
Loading
Loading