diff --git a/src/core/janeway_global_settings.py b/src/core/janeway_global_settings.py index bc4508ef94..cf4f7ff6ca 100755 --- a/src/core/janeway_global_settings.py +++ b/src/core/janeway_global_settings.py @@ -712,3 +712,9 @@ def __len__(self): ) ROR_RECORDS_FILE = "https://zenodo.org/api/communities/ror-data/records?sort=newest" + +# Batch size for ROR bulk_create() calls. Batching keeps each INSERT within the +# server's max_allowed_packet and avoids 'Server has gone away' errors on large +# dumps. The default limit varies (e.g. 4MB on MySQL 5.7, 16MB on MariaDB 10.2.4+), +# so operators with a smaller limit may need to lower this value in local settings. +ROR_BULK_BATCH_SIZE = 1000 diff --git a/src/core/models.py b/src/core/models.py index daaf05d7e6..e340c64202 100644 --- a/src/core/models.py +++ b/src/core/models.py @@ -2367,7 +2367,10 @@ def bulk_create_from_ror(self, ror_records): if "acronym" in name.get("types"): kwargs["acronym_for"] = organization organization_names.append(OrganizationName(**kwargs)) - return OrganizationName.objects.bulk_create(organization_names) + return OrganizationName.objects.bulk_create( + organization_names, + batch_size=settings.ROR_BULK_BATCH_SIZE, + ) @transaction.atomic def bulk_update_from_ror(self, ror_records): @@ -2638,7 +2641,10 @@ def bulk_link_locations_from_ror(self, ror_records): ) ) - Organization.locations.through.objects.bulk_create(organization_location_links) + Organization.locations.through.objects.bulk_create( + organization_location_links, + batch_size=settings.ROR_BULK_BATCH_SIZE, + ) def bulk_create_from_ror(self, ror_records): new_organizations = [] @@ -2660,7 +2666,10 @@ def bulk_create_from_ror(self, ror_records): website=website, ) ) - return self.bulk_create(new_organizations) + return self.bulk_create( + new_organizations, + batch_size=settings.ROR_BULK_BATCH_SIZE, + ) @transaction.atomic def bulk_update_from_ror(self, ror_records): @@ -2725,6 +2734,8 @@ def manage_ror_import(self, ror_import, limit=0): else: records = 
json.loads(string) break + else: + raise ValueError(f"No ROR data file found in {ror_import.zip_path}") new_records = ror_import.filter_new_records( records, @@ -2732,10 +2743,11 @@ def manage_ror_import(self, ror_import, limit=0): ) if new_records: try: - Location.objects.bulk_create_from_ror(new_records) - Organization.objects.bulk_create_from_ror(new_records) - Organization.objects.bulk_link_locations_from_ror(new_records) - OrganizationName.objects.bulk_create_from_ror(new_records) + with transaction.atomic(): + Location.objects.bulk_create_from_ror(new_records) + Organization.objects.bulk_create_from_ror(new_records) + Organization.objects.bulk_link_locations_from_ror(new_records) + OrganizationName.objects.bulk_create_from_ror(new_records) except Exception as error: message = f"{type(error)}: {error}" RORImportError.objects.create( @@ -3261,7 +3273,10 @@ def bulk_create_from_ror(self, ror_records): ) ) current_geonames_ids.add(geonames_id) - return Location.objects.bulk_create(new_locations) + return Location.objects.bulk_create( + new_locations, + batch_size=settings.ROR_BULK_BATCH_SIZE, + ) @transaction.atomic def bulk_update_from_ror(self, ror_records): diff --git a/src/utils/management/commands/clear_ror_data.py b/src/utils/management/commands/clear_ror_data.py new file mode 100644 index 0000000000..c78c598768 --- /dev/null +++ b/src/utils/management/commands/clear_ror_data.py @@ -0,0 +1,75 @@ +from django.core.management.base import BaseCommand +from django.db import transaction + +from core.models import Location, Organization +from utils.logger import get_logger +from utils.models import RORImport, RORImportError + + +logger = get_logger(__name__) + + +class Command(BaseCommand): + """ + Deletes ROR-derived Organization, OrganizationName and Location records, + plus all RORImport history. Intended for use before re-running + import_ror_data when the previous import was incomplete or corrupt + (see issue #5248). 
+ + Custom (user-created) organizations and their custom_label OrganizationName + rows are preserved. ControlledAffiliation rows pointing at deleted + organizations have their organization FK set to NULL via on_delete=SET_NULL. + """ + + help = ( + "Delete all ROR-imported Organizations, Locations and import history. " + "Custom organisations and existing affiliations are preserved." + ) + + def add_arguments(self, parser): + parser.add_argument( + "--no-input", + action="store_true", + help="Do not prompt for confirmation before deleting.", + ) + return super().add_arguments(parser) + + def handle(self, *args, **options): + ror_orgs = Organization.objects.exclude(ror_id="") + ror_locations = Location.objects.filter(geonames_id__isnull=False) + + org_count = ror_orgs.count() + location_count = ror_locations.count() + import_count = RORImport.objects.count() + error_count = RORImportError.objects.count() + + self.stdout.write( + self.style.WARNING( + "WARNING: this is a destructive, irreversible operation. " + "ControlledAffiliation rows pointing at deleted organisations " + "will have their organization FK set to NULL." + ) + ) + self.stdout.write( + f"This will delete:\n" + f" {org_count} ROR Organizations (and their cascade-linked names)\n" + f" {location_count} ROR Locations\n" + f" {import_count} RORImport records\n" + f" {error_count} RORImportError records\n" + ) + + if not options["no_input"]: + confirm = input( + "Type 'wipe' to confirm you want to delete this data: " + ).strip() + if confirm != "wipe": + self.stdout.write("Aborted.") + return + + with transaction.atomic(): + RORImportError.objects.all().delete() + RORImport.objects.all().delete() + ror_orgs.delete() + ror_locations.delete() + + self.stdout.write(self.style.SUCCESS("ROR data cleared."))