diff --git a/doc/release-notes/12082-permission-indexing-improvements.md b/doc/release-notes/12082-permission-indexing-improvements.md new file mode 100644 index 00000000000..38d71d3f28f --- /dev/null +++ b/doc/release-notes/12082-permission-indexing-improvements.md @@ -0,0 +1,6 @@ +Changes in v6.9 that significantly improved re-indexing performance and lowered memory use in situations +such as when a user's role on the root collection were changed, also slowed reindexing of individual +datasets ater editing and publication. + +This release restores/improves the individual dataset reindexing performance while retaining the +benefits of the earlier update. diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java index c25a462efab..921bf97496f 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SearchPermissionsServiceBean.java @@ -92,52 +92,6 @@ public List findDvObjectPerms(DvObject dvObject) { return permStrings; } - public Map getDesiredCards(Dataset dataset) { - Map desiredCards = new LinkedHashMap<>(); - DatasetVersion latestVersion = dataset.getLatestVersion(); - DatasetVersion.VersionState latestVersionState = latestVersion.getVersionState(); - DatasetVersion releasedVersion = dataset.getReleasedVersion(); - boolean atLeastOnePublishedVersion = false; - if (releasedVersion != null) { - atLeastOnePublishedVersion = true; - } else { - atLeastOnePublishedVersion = false; - } - - if (atLeastOnePublishedVersion == false) { - if (latestVersionState.equals(DatasetVersion.VersionState.DRAFT)) { - desiredCards.put(DatasetVersion.VersionState.DRAFT, true); - desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false); - desiredCards.put(DatasetVersion.VersionState.RELEASED, false); - } else if (latestVersionState.equals(DatasetVersion.VersionState.DEACCESSIONED)) { - desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, true); - desiredCards.put(DatasetVersion.VersionState.RELEASED, false); - desiredCards.put(DatasetVersion.VersionState.DRAFT, false); - } else { - String msg = "No-op. Unexpected condition reached: There is no published version and the latest published version is neither " + DatasetVersion.VersionState.DRAFT + " nor " + DatasetVersion.VersionState.DEACCESSIONED + ". Its state is " + latestVersionState + "."; - logger.info(msg); - } - } else if (atLeastOnePublishedVersion == true) { - if (latestVersionState.equals(DatasetVersion.VersionState.RELEASED) - || latestVersionState.equals(DatasetVersion.VersionState.DEACCESSIONED)) { - desiredCards.put(DatasetVersion.VersionState.RELEASED, true); - desiredCards.put(DatasetVersion.VersionState.DRAFT, false); - desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false); - } else if (latestVersionState.equals(DatasetVersion.VersionState.DRAFT)) { - desiredCards.put(DatasetVersion.VersionState.DRAFT, true); - desiredCards.put(DatasetVersion.VersionState.RELEASED, true); - desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false); - } else { - String msg = "No-op. Unexpected condition reached: There is at least one published version but the latest version is neither published nor draft"; - logger.info(msg); - } - } else { - String msg = "No-op. Unexpected condition reached: Has a version been published or not?"; - logger.info(msg); - } - return desiredCards; - } - private boolean hasBeenPublished(Dataverse dataverse) { return dataverse.isReleased(); } diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index aec352a615b..e2e307e3ced 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -1,6 +1,7 @@ package edu.harvard.iq.dataverse.search; import edu.harvard.iq.dataverse.DataFile; +import edu.harvard.iq.dataverse.DataFileServiceBean; import edu.harvard.iq.dataverse.Dataset; import edu.harvard.iq.dataverse.DatasetServiceBean; import edu.harvard.iq.dataverse.DatasetVersion; @@ -53,6 +54,8 @@ public class SolrIndexServiceBean { @EJB SearchPermissionsServiceBean searchPermissionsService; @EJB + DataFileServiceBean dataFileService; + @EJB DataverseServiceBean dataverseService; @EJB DatasetServiceBean datasetService; @@ -83,15 +86,14 @@ public List determineSolrDocs(DvObject dvObject) { solrDocs.addAll(datasetSolrDocs); } else if (dvObject.isInstanceofDataFile()) { DataFile datafile = (DataFile) dvObject; - Map desiredCards = searchPermissionsService.getDesiredCards(datafile.getOwner()); Set datasetVersions = datasetVersionsToBuildCardsFor(datafile.getOwner()); for (DatasetVersion version : datasetVersions) { - if(desiredCards.containsKey(version.getVersionState()) && desiredCards.get(version.getVersionState()) && datafile.isInDatasetVersion(version)) { - List cachedPerms = searchPermissionsService.findDatasetVersionPerms(version); - String solrIdEnd = getDatasetOrDataFileSolrEnding(version.getVersionState()); - Long versionId = version.getId(); - DvObjectSolrDoc fileSolrDoc = constructDatafileSolrDoc(new DataFileProxy(datafile.getFileMetadata()), cachedPerms, versionId, solrIdEnd); - solrDocs.add(fileSolrDoc); + if (datafile.isInDatasetVersion(version)) { + List cachedPerms = searchPermissionsService.findDatasetVersionPerms(version); + String solrIdEnd = getDatasetOrDataFileSolrEnding(version.getVersionState()); + Long versionId = version.getId(); + DvObjectSolrDoc fileSolrDoc = constructDatafileSolrDoc(new DataFileProxy(datafile.getFileMetadata()), cachedPerms, versionId, solrIdEnd); + solrDocs.add(fileSolrDoc); } } } else { @@ -133,13 +135,9 @@ private DvObjectSolrDoc constructDataverseSolrDoc(Dataverse dataverse) { private List constructDatasetSolrDocs(Dataset dataset) { List emptyList = new ArrayList<>(); List solrDocs = emptyList; - Map desiredCards = searchPermissionsService.getDesiredCards(dataset); for (DatasetVersion version : datasetVersionsToBuildCardsFor(dataset)) { - boolean cardShouldExist = desiredCards.get(version.getVersionState()); - if (cardShouldExist) { - DvObjectSolrDoc datasetSolrDoc = makeDatasetSolrDoc(version); - solrDocs.add(datasetSolrDoc); - } + DvObjectSolrDoc datasetSolrDoc = makeDatasetSolrDoc(version); + solrDocs.add(datasetSolrDoc); } return solrDocs; } @@ -158,39 +156,45 @@ private DvObjectSolrDoc constructDatafileSolrDoc(DataFileProxy fileProxy, List constructDatafileSolrDocsFromDataset(Dataset dataset) { List datafileSolrDocs = new ArrayList<>(); - Map desiredCards = searchPermissionsService.getDesiredCards(dataset); for (DatasetVersion datasetVersionFileIsAttachedTo : datasetVersionsToBuildCardsFor(dataset)) { - boolean cardShouldExist = desiredCards.get(datasetVersionFileIsAttachedTo.getVersionState()); - if (cardShouldExist) { - List perms = new ArrayList<>(); - if (datasetVersionFileIsAttachedTo.isReleased()) { - perms.add(IndexServiceBean.getPublicGroupString()); - } else { - perms = searchPermissionsService.findDatasetVersionPerms(datasetVersionFileIsAttachedTo); - } + List perms = new ArrayList<>(); + if (datasetVersionFileIsAttachedTo.isReleased()) { + perms.add(IndexServiceBean.getPublicGroupString()); + } else { + perms = searchPermissionsService.findDatasetVersionPerms(datasetVersionFileIsAttachedTo); + } - for (FileMetadata fileMetadata : datasetVersionFileIsAttachedTo.getFileMetadatas()) { - Long fileId = fileMetadata.getDataFile().getId(); - String solrIdStart = IndexServiceBean.solrDocIdentifierFile + fileId; - String solrIdEnd = getDatasetOrDataFileSolrEnding(datasetVersionFileIsAttachedTo.getVersionState()); - String solrId = solrIdStart + solrIdEnd; - DvObjectSolrDoc dataFileSolrDoc = new DvObjectSolrDoc(fileId.toString(), solrId, datasetVersionFileIsAttachedTo.getId(), fileMetadata.getLabel(), perms); - logger.finest("adding fileid " + fileId); - datafileSolrDocs.add(dataFileSolrDoc); - } + for (FileMetadata fileMetadata : datasetVersionFileIsAttachedTo.getFileMetadatas()) { + Long fileId = fileMetadata.getDataFile().getId(); + String solrIdStart = IndexServiceBean.solrDocIdentifierFile + fileId; + String solrIdEnd = getDatasetOrDataFileSolrEnding(datasetVersionFileIsAttachedTo.getVersionState()); + String solrId = solrIdStart + solrIdEnd; + DvObjectSolrDoc dataFileSolrDoc = new DvObjectSolrDoc(fileId.toString(), solrId, datasetVersionFileIsAttachedTo.getId(), fileMetadata.getLabel(), perms); + logger.finest("adding fileid " + fileId); + datafileSolrDocs.add(dataFileSolrDoc); } } return datafileSolrDocs; } + /** Find the versions to index. The overall logic is + * If there is only one version, or no released version (all non-draft versions are deaccessioned) + * then index it regardless of it's versionstate + * If there are released versions + * then index the latest released version and a draft version if one exists + * Hence - the latest deaccessioned version is only indexed if there is no released version + * @param dataset + * @return the set of versions to build cards for + */ private Set datasetVersionsToBuildCardsFor(Dataset dataset) { Set datasetVersions = new HashSet<>(); DatasetVersion latest = dataset.getLatestVersion(); - if (latest != null) { + DatasetVersion released = dataset.getReleasedVersion(); + if (latest != null && (released == null || latest.isDraft())) { datasetVersions.add(latest); } - DatasetVersion released = dataset.getReleasedVersion(); if (released != null) { + //May be the same as the latest version - only one copy will be in the set in that case datasetVersions.add(released); } return datasetVersions; @@ -365,8 +369,27 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint) indexPermissionsForOneDvObject(definitionPoint); numObjects++; - // Process the dataset's files in a new transaction - self.indexDatasetFilesInNewTransaction(dataset.getId(), counter, fileQueryMin); + /** + * Prepare the data needed for the new transaction. For performance reasons, indexDatasetFilesInNewTransaction does not merge the dataset or versions into the new transaction (we only read info, there + * are no changes to write). However, there are two ways the code here is used. In one case, indexing content and permissions, the versions and fileMetadatas in them are already loaded. In the other + * case, indexing permissions only, the fileMetadatas are not yet loaded, and we may need them, but only if there are fewer than fileQueryMin. For each version that will get reindexed (at most two of + * them), the code below does a lightweight query to see how many fileMetadatas exist in it and, if it is equal to or below fileQueryMin, calls getFileMetadatas().size() to assure they are loaded + * (before we pass the version into a new transaction where it will be detached and fileMetadatas can't be loaded). Calling getFileMetadas.size() should be lightweight when the fileMetadatas are + * loaded (first case) and done only when needed for the second case. + * + **/ + List versionsToIndex = new ArrayList<>(); + for (DatasetVersion version : datasetVersionsToBuildCardsFor(dataset)) { + int fileCount = dataFileService.findCountByDatasetVersionId(version.getId()).intValue(); + if (fileCount >= fileQueryMin) { + // IMPORTANT: This triggers the loading of fileMetadatas within the current transaction + version.getFileMetadatas().size(); + } + versionsToIndex.add(version); + } + + // Process the dataset's files in a new transaction, passing the pre-loaded data + self.indexDatasetFilesInNewTransaction(versionsToIndex, counter, fileQueryMin); } else { // For other types (like files), just index in a new transaction indexPermissionsForOneDvObject(definitionPoint); @@ -387,27 +410,19 @@ public void indexDatasetBatchInNewTransaction(List datasetIds, final int[] indexPermissionsForOneDvObject(dataset); // Process files for this dataset - Map desiredCards = searchPermissionsService.getDesiredCards(dataset); - - for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) { - if (desiredCards.get(version.getVersionState())) { + for (DatasetVersion version : datasetVersionsToBuildCardsFor(dataset)) { processDatasetVersionFiles(version, fileCounter, fileQueryMin); } } } } - } @TransactionAttribute(TransactionAttributeType.REQUIRES_NEW) - public void indexDatasetFilesInNewTransaction(Long datasetId, final int[] fileCounter, int fileQueryMin) { - Dataset dataset = datasetService.find(datasetId); - if (dataset != null) { - Map desiredCards = searchPermissionsService.getDesiredCards(dataset); - for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) { - if (desiredCards.get(version.getVersionState())) { - processDatasetVersionFiles(version, fileCounter, fileQueryMin); - } - } + public void indexDatasetFilesInNewTransaction(List versions, final int[] fileCounter, int fileQueryMin) { + for (DatasetVersion version : versions) { + // The version object is detached, but its fileMetadatas collection is already loaded. + // We only need its ID and state, which are available. + processDatasetVersionFiles(version, fileCounter, fileQueryMin); } } @@ -421,22 +436,24 @@ private void processDatasetVersionFiles(DatasetVersion version, // Process files in batches of 100 int batchSize = 100; - if (version.getFileMetadatas().size() > fileQueryMin) { + if (dataFileService.findCountByDatasetVersionId(version.getId()).intValue() > fileQueryMin) { // For large datasets, use a more efficient SQL query - Stream fileStream = getDataFileInfoForPermissionIndexing(version.getId()); + try (Stream fileStream = getDataFileInfoForPermissionIndexing(version.getId())) { - // Process files in batches to avoid memory issues - fileStream.forEach(fileInfo -> { - filesToReindexAsBatch.add(fileInfo); - fileCounter[0]++; + // Process files in batches to avoid memory issues + fileStream.forEach(fileInfo -> { + filesToReindexAsBatch.add(fileInfo); + fileCounter[0]++; - if (filesToReindexAsBatch.size() >= batchSize) { - reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd); - filesToReindexAsBatch.clear(); - } - }); + if (filesToReindexAsBatch.size() >= batchSize) { + reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd); + filesToReindexAsBatch.clear(); + } + }); + } } else { // For smaller datasets, process files directly + // We only call getFileMetadatas() in the case where we know they have already been loaded for (FileMetadata fmd : version.getFileMetadatas()) { DataFileProxy fileProxy = new DataFileProxy(fmd); filesToReindexAsBatch.add(fileProxy); @@ -477,18 +494,6 @@ private void reindexFilesInBatches(List filesToReindexAsBatch, Li } } - private List versionsToReIndexPermissionsFor(Dataset dataset) { - List versionsToReindexPermissionsFor = new ArrayList<>(); - Map desiredCards = searchPermissionsService.getDesiredCards(dataset); - for (DatasetVersion version : datasetVersionsToBuildCardsFor(dataset)) { - boolean cardShouldExist = desiredCards.get(version.getVersionState()); - if (cardShouldExist) { - versionsToReindexPermissionsFor.add(version); - } - } - return versionsToReindexPermissionsFor; - } - public IndexResponse deleteMultipleSolrIds(List solrIdsToDelete) { if (solrIdsToDelete.isEmpty()) { return new IndexResponse("nothing to delete"); diff --git a/src/test/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBeanTest.java b/src/test/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBeanTest.java new file mode 100644 index 00000000000..f04d5a7896c --- /dev/null +++ b/src/test/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBeanTest.java @@ -0,0 +1,140 @@ + +package edu.harvard.iq.dataverse.search; + +import edu.harvard.iq.dataverse.Dataset; +import edu.harvard.iq.dataverse.DatasetVersion; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.InjectMocks; +import org.mockito.MockitoAnnotations; + +import java.util.Set; + +import static org.junit.jupiter.api.Assertions.*; +import static org.mockito.Mockito.*; + +public class SolrIndexServiceBeanTest { + + @InjectMocks + private SolrIndexServiceBean solrIndexServiceBean; + + @BeforeEach + public void setUp() { + MockitoAnnotations.openMocks(this); + } + + @Test + public void testDatasetVersionsToBuildCardsFor_OnlyDraft() { + // Arrange + Dataset dataset = mock(Dataset.class); + DatasetVersion draftVersion = createMockVersion(1L, DatasetVersion.VersionState.DRAFT, true); + + when(dataset.getLatestVersion()).thenReturn(draftVersion); + when(dataset.getReleasedVersion()).thenReturn(null); + + // Act + Set result = invokeDatasetVersionsToBuildCardsFor(dataset); + + // Assert + assertEquals(1, result.size()); + assertTrue(result.contains(draftVersion)); + } + + @Test + public void testDatasetVersionsToBuildCardsFor_OnlyDeaccessioned() { + // Arrange + Dataset dataset = mock(Dataset.class); + DatasetVersion deaccessionedVersion = createMockVersion(1L, DatasetVersion.VersionState.DEACCESSIONED, false); + + when(dataset.getLatestVersion()).thenReturn(deaccessionedVersion); + when(dataset.getReleasedVersion()).thenReturn(null); + + // Act + Set result = invokeDatasetVersionsToBuildCardsFor(dataset); + + // Assert + assertEquals(1, result.size()); + assertTrue(result.contains(deaccessionedVersion)); + } + + @Test + public void testDatasetVersionsToBuildCardsFor_OnlyReleased() { + // Arrange + Dataset dataset = mock(Dataset.class); + DatasetVersion releasedVersion = createMockVersion(1L, DatasetVersion.VersionState.RELEASED, false); + + when(dataset.getLatestVersion()).thenReturn(releasedVersion); + when(dataset.getReleasedVersion()).thenReturn(releasedVersion); + + // Act + Set result = invokeDatasetVersionsToBuildCardsFor(dataset); + + // Assert + assertEquals(1, result.size()); + assertTrue(result.contains(releasedVersion)); + } + + @Test + public void testDatasetVersionsToBuildCardsFor_ReleasedAndDraft() { + // Arrange + Dataset dataset = mock(Dataset.class); + DatasetVersion releasedVersion = createMockVersion(1L, DatasetVersion.VersionState.RELEASED, false); + DatasetVersion draftVersion = createMockVersion(2L, DatasetVersion.VersionState.DRAFT, true); + + when(dataset.getLatestVersion()).thenReturn(draftVersion); + when(dataset.getReleasedVersion()).thenReturn(releasedVersion); + + // Act + Set result = invokeDatasetVersionsToBuildCardsFor(dataset); + + // Assert + assertEquals(2, result.size()); + assertTrue(result.contains(releasedVersion)); + assertTrue(result.contains(draftVersion)); + } + + @Test + public void testDatasetVersionsToBuildCardsFor_ReleasedAndDeaccessioned() { + // Arrange + Dataset dataset = mock(Dataset.class); + DatasetVersion releasedVersion = createMockVersion(1L, DatasetVersion.VersionState.RELEASED, false); + DatasetVersion deaccessionedVersion = createMockVersion(2L, DatasetVersion.VersionState.DEACCESSIONED, false); + + // Latest is deaccessioned, but there's a released version + when(dataset.getLatestVersion()).thenReturn(deaccessionedVersion); + when(dataset.getReleasedVersion()).thenReturn(releasedVersion); + + // Act + Set result = invokeDatasetVersionsToBuildCardsFor(dataset); + + // Assert + // Should only return the released version, not the deaccessioned one + assertEquals(1, result.size()); + assertTrue(result.contains(releasedVersion)); + assertFalse(result.contains(deaccessionedVersion)); + } + + // Helper method to create mock DatasetVersion + private DatasetVersion createMockVersion(Long id, DatasetVersion.VersionState state, boolean isDraft) { + DatasetVersion version = mock(DatasetVersion.class); + when(version.getId()).thenReturn(id); + when(version.getVersionState()).thenReturn(state); + when(version.isDraft()).thenReturn(isDraft); + when(version.isReleased()).thenReturn(state == DatasetVersion.VersionState.RELEASED); + when(version.isDeaccessioned()).thenReturn(state == DatasetVersion.VersionState.DEACCESSIONED); + return version; + } + + // Helper method to invoke the private method using reflection + @SuppressWarnings("unchecked") + private Set invokeDatasetVersionsToBuildCardsFor(Dataset dataset) { + try { + java.lang.reflect.Method method = SolrIndexServiceBean.class.getDeclaredMethod( + "datasetVersionsToBuildCardsFor", Dataset.class); + method.setAccessible(true); + return (Set) method.invoke(solrIndexServiceBean, dataset); + } catch (Exception e) { + throw new RuntimeException("Failed to invoke private method", e); + } + } +} \ No newline at end of file