diff --git a/doc/release-notes/12082-permission-indexing-improvements.md b/doc/release-notes/12082-permission-indexing-improvements.md
new file mode 100644
index 00000000000..38d71d3f28f
--- /dev/null
+++ b/doc/release-notes/12082-permission-indexing-improvements.md
@@ -0,0 +1,6 @@
+Changes in v6.9 that significantly improved re-indexing performance and lowered memory use in situations
+such as when a user's role on the root collection was changed, also slowed reindexing of individual
+datasets after editing and publication.
+
+This release restores/improves the individual dataset reindexing performance while retaining the
+benefits of the earlier update.
diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java
index aec352a615b..7ae239d9a2a 100644
--- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java
+++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java
@@ -1,6 +1,7 @@
 package edu.harvard.iq.dataverse.search;
 
 import edu.harvard.iq.dataverse.DataFile;
+import edu.harvard.iq.dataverse.DataFileServiceBean;
 import edu.harvard.iq.dataverse.Dataset;
 import edu.harvard.iq.dataverse.DatasetServiceBean;
 import edu.harvard.iq.dataverse.DatasetVersion;
@@ -53,6 +54,8 @@ public class SolrIndexServiceBean {
     @EJB
     SearchPermissionsServiceBean searchPermissionsService;
     @EJB
+    DataFileServiceBean dataFileService;
+    @EJB
     DataverseServiceBean dataverseService;
     @EJB
     DatasetServiceBean datasetService;
@@ -365,8 +368,30 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint)
             indexPermissionsForOneDvObject(definitionPoint);
             numObjects++;
 
-            // Process the dataset's files in a new transaction
-            self.indexDatasetFilesInNewTransaction(dataset.getId(), counter, fileQueryMin);
+            /**
+             * Prepare the data needed for the new transaction. For performance reasons, indexDatasetFilesInNewTransaction does not merge the dataset or versions into the new transaction (we only read info, there
+             * are no changes to write). However, there are two ways the code here is used. In one case, indexing content and permissions, the versions and the fileMetadatas in them are already loaded. In the other
+             * case, indexing permissions only, the fileMetadatas are not yet loaded, and we may need them, but only if there are no more than fileQueryMin of them. For each version that will get reindexed (at most
+             * two of them), the code below does a lightweight query to see how many fileMetadatas exist in it and, if the count is equal to or below fileQueryMin, calls getFileMetadatas().size() to assure they are
+             * loaded (before we pass the version into a new transaction, where it will be detached and fileMetadatas can't be loaded). Calling getFileMetadatas().size() should be lightweight when the fileMetadatas
+             * are already loaded (the first case) and is done only when needed in the second case.
+             *
+             **/
+            Map<DatasetVersion.VersionState, Boolean> desiredCards = searchPermissionsService.getDesiredCards(dataset);
+            List<DatasetVersion> versionsToIndex = new ArrayList<>();
+            for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) {
+                if (desiredCards.get(version.getVersionState())) {
+                    int fileCount = dataFileService.findCountByDatasetVersionId(version.getId()).intValue();
+                    if (fileCount <= fileQueryMin) {
+                        // IMPORTANT: This triggers the loading of fileMetadatas within the current transaction
+                        version.getFileMetadatas().size();
+                    }
+                    versionsToIndex.add(version);
+                }
+            }
+
+            // Process the dataset's files in a new transaction, passing the pre-loaded data
+            self.indexDatasetFilesInNewTransaction(versionsToIndex, counter, fileQueryMin);
         } else {
             // For other types (like files), just index in a new transaction
             indexPermissionsForOneDvObject(definitionPoint);
@@ -399,15 +424,11 @@ public void indexDatasetBatchInNewTransaction(List<Long> datasetIds, final int[]
     }
 
     @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
-    public void indexDatasetFilesInNewTransaction(Long datasetId, final int[] fileCounter, int fileQueryMin) {
-        Dataset dataset = datasetService.find(datasetId);
-        if (dataset != null) {
-            Map<DatasetVersion.VersionState, Boolean> desiredCards = searchPermissionsService.getDesiredCards(dataset);
-            for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) {
-                if (desiredCards.get(version.getVersionState())) {
-                    processDatasetVersionFiles(version, fileCounter, fileQueryMin);
-                }
-            }
-        }
+    public void indexDatasetFilesInNewTransaction(List<DatasetVersion> versions, final int[] fileCounter, int fileQueryMin) {
+        for (DatasetVersion version : versions) {
+            // The version object is detached here, but its ID and state are available, and its
+            // fileMetadatas collection has already been loaded in the cases where it will be needed.
+            processDatasetVersionFiles(version, fileCounter, fileQueryMin);
+        }
     }
 
@@ -421,22 +442,24 @@ private void processDatasetVersionFiles(DatasetVersion version,
         // Process files in batches of 100
        int batchSize = 100;
 
-        if (version.getFileMetadatas().size() > fileQueryMin) {
+        if (dataFileService.findCountByDatasetVersionId(version.getId()).intValue() > fileQueryMin) {
             // For large datasets, use a more efficient SQL query
-            Stream<DataFileProxy> fileStream = getDataFileInfoForPermissionIndexing(version.getId());
+            try (Stream<DataFileProxy> fileStream = getDataFileInfoForPermissionIndexing(version.getId())) {
 
-            // Process files in batches to avoid memory issues
-            fileStream.forEach(fileInfo -> {
-                filesToReindexAsBatch.add(fileInfo);
-                fileCounter[0]++;
+                // Process files in batches to avoid memory issues
+                fileStream.forEach(fileInfo -> {
+                    filesToReindexAsBatch.add(fileInfo);
+                    fileCounter[0]++;
 
-                if (filesToReindexAsBatch.size() >= batchSize) {
-                    reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd);
-                    filesToReindexAsBatch.clear();
-                }
-            });
+                    if (filesToReindexAsBatch.size() >= batchSize) {
+                        reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd);
+                        filesToReindexAsBatch.clear();
+                    }
+                });
+            }
         } else {
             // For smaller datasets, process files directly
+            // We only call getFileMetadatas() in the case where we know they have already been loaded
             for (FileMetadata fmd : version.getFileMetadatas()) {
                 DataFileProxy fileProxy = new DataFileProxy(fmd);
                 filesToReindexAsBatch.add(fileProxy);
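
A note for readers of this patch: the core trick is initializing a lazy JPA collection while its owning entity is still managed, so the entity can then be handed, detached, to a `REQUIRES_NEW` method without re-querying. The sketch below shows that pattern in isolation under Jakarta EE. The `Album`/`Track` entities, `AlbumIndexBean`, and `QUERY_MIN` (standing in for `fileQueryMin`) are hypothetical names invented for illustration, not Dataverse classes.

```java
import java.util.List;

import jakarta.ejb.EJB;
import jakarta.ejb.Stateless;
import jakarta.ejb.TransactionAttribute;
import jakarta.ejb.TransactionAttributeType;
import jakarta.persistence.*;

@Entity
class Album {
    @Id
    Long id;
    @OneToMany(mappedBy = "album", fetch = FetchType.LAZY)
    List<Track> tracks;

    Long getId() { return id; }
    List<Track> getTracks() { return tracks; }
}

@Entity
class Track {
    @Id
    Long id;
    String title;
    @ManyToOne
    Album album;
}

@Stateless
public class AlbumIndexBean {

    private static final int QUERY_MIN = 100; // stands in for fileQueryMin

    @PersistenceContext
    private EntityManager em;

    // Call REQUIRES_NEW methods through the container proxy; a direct
    // this.indexTracks(...) call would ignore the transaction attribute.
    @EJB
    private AlbumIndexBean self;

    public void reindex(Long albumId) {
        Album album = em.find(Album.class, albumId);
        if (album == null) {
            return;
        }

        // Lightweight COUNT query instead of materializing the collection.
        long trackCount = em.createQuery(
                "select count(t) from Track t where t.album.id = :id", Long.class)
                .setParameter("id", albumId).getSingleResult();

        if (trackCount <= QUERY_MIN) {
            // Trigger lazy loading now, while 'album' is still managed. Inside the
            // new transaction the entity is detached, and a first access to the
            // collection there would fail.
            album.getTracks().size();
        }

        self.indexTracks(album, trackCount > QUERY_MIN);
    }

    @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
    public void indexTracks(Album album, boolean useStreamingQuery) {
        if (useStreamingQuery) {
            // Large case: stream a projection query; nothing is read through the
            // detached entity except its ID.
            try (var titles = em.createQuery(
                    "select t.title from Track t where t.album.id = :id", String.class)
                    .setParameter("id", album.getId()).getResultStream()) {
                titles.forEach(title -> { /* index one item */ });
            }
        } else {
            // Small case: safe only because the collection was pre-loaded above.
            for (Track t : album.getTracks()) { /* index one item */ }
        }
    }
}
```

The same trade-off as in the patch applies: the extra COUNT query is cheap relative to loading a large collection, and calling `size()` on an already-initialized collection costs nothing, so the preparation in the calling transaction stays lightweight on both paths.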