Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/core/janeway_global_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -712,3 +712,9 @@ def __len__(self):
)

ROR_RECORDS_FILE = "https://zenodo.org/api/communities/ror-data/records?sort=newest"

# Batch size for ROR bulk_create() inserts. Chunking keeps each INSERT within
# MySQL's default max_allowed_packet (16MB on older servers) and avoids
# 'Server has gone away' errors on large dumps. Operators whose MySQL server
# has a smaller max_allowed_packet may need to lower this value in their local settings.
ROR_BULK_BATCH_SIZE = 1000
31 changes: 23 additions & 8 deletions src/core/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2367,7 +2367,10 @@ def bulk_create_from_ror(self, ror_records):
if "acronym" in name.get("types"):
kwargs["acronym_for"] = organization
organization_names.append(OrganizationName(**kwargs))
return OrganizationName.objects.bulk_create(organization_names)
return OrganizationName.objects.bulk_create(
organization_names,
batch_size=settings.ROR_BULK_BATCH_SIZE,
)

@transaction.atomic
def bulk_update_from_ror(self, ror_records):
Expand Down Expand Up @@ -2638,7 +2641,10 @@ def bulk_link_locations_from_ror(self, ror_records):
)
)

Organization.locations.through.objects.bulk_create(organization_location_links)
Organization.locations.through.objects.bulk_create(
organization_location_links,
batch_size=settings.ROR_BULK_BATCH_SIZE,
)

def bulk_create_from_ror(self, ror_records):
new_organizations = []
Expand All @@ -2660,7 +2666,10 @@ def bulk_create_from_ror(self, ror_records):
website=website,
)
)
return self.bulk_create(new_organizations)
return self.bulk_create(
new_organizations,
batch_size=settings.ROR_BULK_BATCH_SIZE,
)

@transaction.atomic
def bulk_update_from_ror(self, ror_records):
Expand Down Expand Up @@ -2725,17 +2734,20 @@ def manage_ror_import(self, ror_import, limit=0):
else:
records = json.loads(string)
break
else:
raise ValueError(f"No ROR data file found in {ror_import.zip_path}")

new_records = ror_import.filter_new_records(
records,
self.ror_ids_and_timestamps(),
)
if new_records:
try:
Location.objects.bulk_create_from_ror(new_records)
Organization.objects.bulk_create_from_ror(new_records)
Organization.objects.bulk_link_locations_from_ror(new_records)
OrganizationName.objects.bulk_create_from_ror(new_records)
with transaction.atomic():
Location.objects.bulk_create_from_ror(new_records)
Organization.objects.bulk_create_from_ror(new_records)
Organization.objects.bulk_link_locations_from_ror(new_records)
OrganizationName.objects.bulk_create_from_ror(new_records)
except Exception as error:
message = f"{type(error)}: {error}"
RORImportError.objects.create(
Expand Down Expand Up @@ -3261,7 +3273,10 @@ def bulk_create_from_ror(self, ror_records):
)
)
current_geonames_ids.add(geonames_id)
return Location.objects.bulk_create(new_locations)
return Location.objects.bulk_create(
new_locations,
batch_size=settings.ROR_BULK_BATCH_SIZE,
)

@transaction.atomic
def bulk_update_from_ror(self, ror_records):
Expand Down
75 changes: 75 additions & 0 deletions src/utils/management/commands/clear_ror_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
from django.core.management.base import BaseCommand, CommandError
from django.db import transaction

from core.models import Location, Organization
from utils.logger import get_logger
from utils.models import RORImport, RORImportError


logger = get_logger(__name__)


class Command(BaseCommand):
    """
    Deletes ROR-derived Organization, OrganizationName and Location records,
    plus all RORImport history. Intended for use before re-running
    import_ror_data when the previous import was incomplete or corrupt
    (see issue #5248).

    Custom (user-created) organizations and their custom_label OrganizationName
    rows are preserved. ControlledAffiliation rows pointing at deleted
    organizations have their organization FK set to NULL via on_delete=SET_NULL.

    Unless --no-input is given, the operator must type 'wipe' at a prompt
    before anything is deleted. If the prompt cannot be read (stdin closed or
    not interactive), the command fails with a CommandError and deletes
    nothing.
    """

    help = (
        "Delete all ROR-imported Organizations, Locations and import history. "
        "Custom organisations and existing affiliations are preserved."
    )

    def add_arguments(self, parser):
        """Register the --no-input flag, which skips the confirmation prompt."""
        parser.add_argument(
            "--no-input",
            action="store_true",
            help="Do not prompt for confirmation before deleting.",
        )
        return super().add_arguments(parser)

    def handle(self, *args, **options):
        """
        Report what will be deleted, confirm with the operator, then delete.

        Raises:
            CommandError: if confirmation is required but stdin reaches EOF
                before an answer can be read.
        """
        # Organizations with a non-empty ror_id and Locations with a
        # geonames_id are the rows this command treats as ROR-derived.
        # Querysets are lazy, so the same objects serve both the counts
        # below and the deletes at the end.
        ror_orgs = Organization.objects.exclude(ror_id="")
        ror_locations = Location.objects.filter(geonames_id__isnull=False)

        org_count = ror_orgs.count()
        location_count = ror_locations.count()
        import_count = RORImport.objects.count()
        error_count = RORImportError.objects.count()

        self.stdout.write(
            self.style.WARNING(
                "WARNING: this is a destructive, irreversible operation. "
                "ControlledAffiliation rows pointing at deleted organisations "
                "will have their organization FK set to NULL."
            )
        )
        self.stdout.write(
            f"This will delete:\n"
            f" {org_count} ROR Organizations (and their cascade-linked names)\n"
            f" {location_count} ROR Locations\n"
            f" {import_count} RORImport records\n"
            f" {error_count} RORImportError records\n"
        )

        if not options["no_input"]:
            try:
                confirm = input(
                    "Type 'wipe' to confirm you want to delete this data: "
                ).strip()
            except EOFError:
                # input() raises EOFError when stdin is closed or exhausted
                # (cron, CI, piped invocations). Fail with a clear message and
                # a non-zero exit instead of a raw traceback; nothing has been
                # deleted at this point.
                raise CommandError(
                    "Could not read confirmation from stdin. "
                    "Re-run with --no-input to skip the prompt."
                ) from None
            if confirm != "wipe":
                self.stdout.write("Aborted.")
                return

        # A single transaction, so a failure part-way through rolls back
        # every delete rather than leaving a half-cleared dataset.
        with transaction.atomic():
            RORImportError.objects.all().delete()
            RORImport.objects.all().delete()
            ror_orgs.delete()
            ror_locations.delete()

        self.stdout.write(self.style.SUCCESS("ROR data cleared."))
Loading