Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions doc/release-notes/12082-permission-indexing-improvements.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
Changes in v6.9 that significantly improved re-indexing performance and lowered memory use in situations
such as when a user's role on the root collection was changed also slowed reindexing of individual
datasets after editing and publication.

This release restores/improves the individual dataset reindexing performance while retaining the
benefits of the earlier update.
Original file line number Diff line number Diff line change
Expand Up @@ -92,52 +92,6 @@ public List<String> findDvObjectPerms(DvObject dvObject) {
return permStrings;
}

/**
 * Determines which search "cards" (one per dataset version state) should exist
 * in the index for the given dataset. Exactly one expected combination of
 * latest/released version states maps each state to true or false; if the
 * dataset is in an unexpected state combination, the condition is logged and
 * an empty (or partial) map is returned.
 *
 * @param dataset the dataset whose desired cards are computed; assumed to have
 *                a non-null latest version — TODO confirm with callers
 * @return map from version state to whether a card should exist for it
 */
public Map<DatasetVersion.VersionState, Boolean> getDesiredCards(Dataset dataset) {
    Map<DatasetVersion.VersionState, Boolean> desiredCards = new LinkedHashMap<>();
    DatasetVersion latestVersion = dataset.getLatestVersion();
    DatasetVersion.VersionState latestVersionState = latestVersion.getVersionState();
    DatasetVersion releasedVersion = dataset.getReleasedVersion();
    // A non-null released version means at least one version has been published.
    // (Simplified from a redundant if/else assignment.)
    boolean atLeastOnePublishedVersion = (releasedVersion != null);

    if (!atLeastOnePublishedVersion) {
        if (latestVersionState.equals(DatasetVersion.VersionState.DRAFT)) {
            // Unpublished dataset with a draft: only the draft card exists.
            desiredCards.put(DatasetVersion.VersionState.DRAFT, true);
            desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false);
            desiredCards.put(DatasetVersion.VersionState.RELEASED, false);
        } else if (latestVersionState.equals(DatasetVersion.VersionState.DEACCESSIONED)) {
            // No released version and the latest is deaccessioned: only the
            // deaccessioned card exists.
            desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, true);
            desiredCards.put(DatasetVersion.VersionState.RELEASED, false);
            desiredCards.put(DatasetVersion.VersionState.DRAFT, false);
        } else {
            String msg = "No-op. Unexpected condition reached: There is no published version and the latest published version is neither " + DatasetVersion.VersionState.DRAFT + " nor " + DatasetVersion.VersionState.DEACCESSIONED + ". Its state is " + latestVersionState + ".";
            logger.info(msg);
        }
    } else {
        // At least one published version exists. (The former trailing else
        // branch for "neither published nor not" was unreachable on a boolean
        // and has been removed.)
        if (latestVersionState.equals(DatasetVersion.VersionState.RELEASED)
                || latestVersionState.equals(DatasetVersion.VersionState.DEACCESSIONED)) {
            // Latest is released (or a deaccessioned version after a release):
            // only the released card exists.
            desiredCards.put(DatasetVersion.VersionState.RELEASED, true);
            desiredCards.put(DatasetVersion.VersionState.DRAFT, false);
            desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false);
        } else if (latestVersionState.equals(DatasetVersion.VersionState.DRAFT)) {
            // Published dataset with a newer draft: both draft and released
            // cards exist.
            desiredCards.put(DatasetVersion.VersionState.DRAFT, true);
            desiredCards.put(DatasetVersion.VersionState.RELEASED, true);
            desiredCards.put(DatasetVersion.VersionState.DEACCESSIONED, false);
        } else {
            String msg = "No-op. Unexpected condition reached: There is at least one published version but the latest version is neither published nor draft";
            logger.info(msg);
        }
    }
    return desiredCards;
}

/**
 * Whether the given dataverse (collection) has been published.
 *
 * @param dataverse the collection to check
 * @return true if the dataverse is released
 */
private boolean hasBeenPublished(Dataverse dataverse) {
return dataverse.isReleased();
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package edu.harvard.iq.dataverse.search;

import edu.harvard.iq.dataverse.DataFile;
import edu.harvard.iq.dataverse.DataFileServiceBean;
import edu.harvard.iq.dataverse.Dataset;
import edu.harvard.iq.dataverse.DatasetServiceBean;
import edu.harvard.iq.dataverse.DatasetVersion;
Expand Down Expand Up @@ -53,6 +54,8 @@ public class SolrIndexServiceBean {
@EJB
SearchPermissionsServiceBean searchPermissionsService;
@EJB
DataFileServiceBean dataFileService;
@EJB
DataverseServiceBean dataverseService;
@EJB
DatasetServiceBean datasetService;
Expand Down Expand Up @@ -83,15 +86,14 @@ public List<DvObjectSolrDoc> determineSolrDocs(DvObject dvObject) {
solrDocs.addAll(datasetSolrDocs);
} else if (dvObject.isInstanceofDataFile()) {
DataFile datafile = (DataFile) dvObject;
Map<DatasetVersion.VersionState, Boolean> desiredCards = searchPermissionsService.getDesiredCards(datafile.getOwner());
Set<DatasetVersion> datasetVersions = datasetVersionsToBuildCardsFor(datafile.getOwner());
for (DatasetVersion version : datasetVersions) {
if(desiredCards.containsKey(version.getVersionState()) && desiredCards.get(version.getVersionState()) && datafile.isInDatasetVersion(version)) {
List<String> cachedPerms = searchPermissionsService.findDatasetVersionPerms(version);
String solrIdEnd = getDatasetOrDataFileSolrEnding(version.getVersionState());
Long versionId = version.getId();
DvObjectSolrDoc fileSolrDoc = constructDatafileSolrDoc(new DataFileProxy(datafile.getFileMetadata()), cachedPerms, versionId, solrIdEnd);
solrDocs.add(fileSolrDoc);
if (datafile.isInDatasetVersion(version)) {
List<String> cachedPerms = searchPermissionsService.findDatasetVersionPerms(version);
String solrIdEnd = getDatasetOrDataFileSolrEnding(version.getVersionState());
Long versionId = version.getId();
DvObjectSolrDoc fileSolrDoc = constructDatafileSolrDoc(new DataFileProxy(datafile.getFileMetadata()), cachedPerms, versionId, solrIdEnd);
solrDocs.add(fileSolrDoc);
}
}
} else {
Expand Down Expand Up @@ -133,13 +135,9 @@ private DvObjectSolrDoc constructDataverseSolrDoc(Dataverse dataverse) {
private List<DvObjectSolrDoc> constructDatasetSolrDocs(Dataset dataset) {
List<DvObjectSolrDoc> emptyList = new ArrayList<>();
List<DvObjectSolrDoc> solrDocs = emptyList;
Map<DatasetVersion.VersionState, Boolean> desiredCards = searchPermissionsService.getDesiredCards(dataset);
for (DatasetVersion version : datasetVersionsToBuildCardsFor(dataset)) {
boolean cardShouldExist = desiredCards.get(version.getVersionState());
if (cardShouldExist) {
DvObjectSolrDoc datasetSolrDoc = makeDatasetSolrDoc(version);
solrDocs.add(datasetSolrDoc);
}
DvObjectSolrDoc datasetSolrDoc = makeDatasetSolrDoc(version);
solrDocs.add(datasetSolrDoc);
}
return solrDocs;
}
Expand All @@ -158,39 +156,45 @@ private DvObjectSolrDoc constructDatafileSolrDoc(DataFileProxy fileProxy, List<S

private List<DvObjectSolrDoc> constructDatafileSolrDocsFromDataset(Dataset dataset) {
List<DvObjectSolrDoc> datafileSolrDocs = new ArrayList<>();
Map<DatasetVersion.VersionState, Boolean> desiredCards = searchPermissionsService.getDesiredCards(dataset);
for (DatasetVersion datasetVersionFileIsAttachedTo : datasetVersionsToBuildCardsFor(dataset)) {
boolean cardShouldExist = desiredCards.get(datasetVersionFileIsAttachedTo.getVersionState());
if (cardShouldExist) {
List<String> perms = new ArrayList<>();
if (datasetVersionFileIsAttachedTo.isReleased()) {
perms.add(IndexServiceBean.getPublicGroupString());
} else {
perms = searchPermissionsService.findDatasetVersionPerms(datasetVersionFileIsAttachedTo);
}
List<String> perms = new ArrayList<>();
if (datasetVersionFileIsAttachedTo.isReleased()) {
perms.add(IndexServiceBean.getPublicGroupString());
} else {
perms = searchPermissionsService.findDatasetVersionPerms(datasetVersionFileIsAttachedTo);
}

for (FileMetadata fileMetadata : datasetVersionFileIsAttachedTo.getFileMetadatas()) {
Long fileId = fileMetadata.getDataFile().getId();
String solrIdStart = IndexServiceBean.solrDocIdentifierFile + fileId;
String solrIdEnd = getDatasetOrDataFileSolrEnding(datasetVersionFileIsAttachedTo.getVersionState());
String solrId = solrIdStart + solrIdEnd;
DvObjectSolrDoc dataFileSolrDoc = new DvObjectSolrDoc(fileId.toString(), solrId, datasetVersionFileIsAttachedTo.getId(), fileMetadata.getLabel(), perms);
logger.finest("adding fileid " + fileId);
datafileSolrDocs.add(dataFileSolrDoc);
}
for (FileMetadata fileMetadata : datasetVersionFileIsAttachedTo.getFileMetadatas()) {
Long fileId = fileMetadata.getDataFile().getId();
String solrIdStart = IndexServiceBean.solrDocIdentifierFile + fileId;
String solrIdEnd = getDatasetOrDataFileSolrEnding(datasetVersionFileIsAttachedTo.getVersionState());
String solrId = solrIdStart + solrIdEnd;
DvObjectSolrDoc dataFileSolrDoc = new DvObjectSolrDoc(fileId.toString(), solrId, datasetVersionFileIsAttachedTo.getId(), fileMetadata.getLabel(), perms);
logger.finest("adding fileid " + fileId);
datafileSolrDocs.add(dataFileSolrDoc);
}
}
return datafileSolrDocs;
}

/** Find the versions to index. The overall logic is
* If there is only one version, or no released version (all non-draft versions are deaccessioned)
* then index it regardless of its version state
* If there are released versions
* then index the latest released version and a draft version if one exists
* Hence - the latest deaccessioned version is only indexed if there is no released version
* @param dataset
* @return the set of versions to build cards for
*/
private Set<DatasetVersion> datasetVersionsToBuildCardsFor(Dataset dataset) {
Set<DatasetVersion> datasetVersions = new HashSet<>();
DatasetVersion latest = dataset.getLatestVersion();
if (latest != null) {
DatasetVersion released = dataset.getReleasedVersion();
if (latest != null && (released == null || latest.isDraft())) {
datasetVersions.add(latest);
}
DatasetVersion released = dataset.getReleasedVersion();
if (released != null) {
//May be the same as the latest version - only one copy will be in the set in that case
datasetVersions.add(released);
}
return datasetVersions;
Expand Down Expand Up @@ -365,8 +369,27 @@ public IndexResponse indexPermissionsOnSelfAndChildren(DvObject definitionPoint)
indexPermissionsForOneDvObject(definitionPoint);
numObjects++;

// Process the dataset's files in a new transaction
self.indexDatasetFilesInNewTransaction(dataset.getId(), counter, fileQueryMin);
/**
* Prepare the data needed for the new transaction. For performance reasons, indexDatasetFilesInNewTransaction does not merge the dataset or versions into the new transaction (we only read info, there
* are no changes to write). However, there are two ways the code here is used. In one case, indexing content and permissions, the versions and fileMetadatas in them are already loaded. In the other
* case, indexing permissions only, the fileMetadatas are not yet loaded, and we may need them, but only if there are fewer than fileQueryMin. For each version that will get reindexed (at most two of
* them), the code below does a lightweight query to see how many fileMetadatas exist in it and, if it is equal to or below fileQueryMin, calls getFileMetadatas().size() to assure they are loaded
* (before we pass the version into a new transaction where it will be detached and fileMetadatas can't be loaded). Calling getFileMetadatas().size() should be lightweight when the fileMetadatas are
* loaded (first case) and done only when needed for the second case.
*
**/
List<DatasetVersion> versionsToIndex = new ArrayList<>();
for (DatasetVersion version : datasetVersionsToBuildCardsFor(dataset)) {
int fileCount = dataFileService.findCountByDatasetVersionId(version.getId()).intValue();
if (fileCount >= fileQueryMin) {
// IMPORTANT: This triggers the loading of fileMetadatas within the current transaction
version.getFileMetadatas().size();
}
versionsToIndex.add(version);
}

// Process the dataset's files in a new transaction, passing the pre-loaded data
self.indexDatasetFilesInNewTransaction(versionsToIndex, counter, fileQueryMin);
} else {
// For other types (like files), just index in a new transaction
indexPermissionsForOneDvObject(definitionPoint);
Expand All @@ -387,27 +410,19 @@ public void indexDatasetBatchInNewTransaction(List<Long> datasetIds, final int[]
indexPermissionsForOneDvObject(dataset);

// Process files for this dataset
Map<DatasetVersion.VersionState, Boolean> desiredCards = searchPermissionsService.getDesiredCards(dataset);

for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) {
if (desiredCards.get(version.getVersionState())) {
for (DatasetVersion version : datasetVersionsToBuildCardsFor(dataset)) {
processDatasetVersionFiles(version, fileCounter, fileQueryMin);
}
}
}
}
}

@TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
public void indexDatasetFilesInNewTransaction(Long datasetId, final int[] fileCounter, int fileQueryMin) {
Dataset dataset = datasetService.find(datasetId);
if (dataset != null) {
Map<DatasetVersion.VersionState, Boolean> desiredCards = searchPermissionsService.getDesiredCards(dataset);
for (DatasetVersion version : versionsToReIndexPermissionsFor(dataset)) {
if (desiredCards.get(version.getVersionState())) {
processDatasetVersionFiles(version, fileCounter, fileQueryMin);
}
}
public void indexDatasetFilesInNewTransaction(List<DatasetVersion> versions, final int[] fileCounter, int fileQueryMin) {
for (DatasetVersion version : versions) {
// The version object is detached, but its fileMetadatas collection is already loaded.
// We only need its ID and state, which are available.
processDatasetVersionFiles(version, fileCounter, fileQueryMin);
}
}

Expand All @@ -421,22 +436,24 @@ private void processDatasetVersionFiles(DatasetVersion version,
// Process files in batches of 100
int batchSize = 100;

if (version.getFileMetadatas().size() > fileQueryMin) {
if (dataFileService.findCountByDatasetVersionId(version.getId()).intValue() > fileQueryMin) {
// For large datasets, use a more efficient SQL query
Stream<DataFileProxy> fileStream = getDataFileInfoForPermissionIndexing(version.getId());
try (Stream<DataFileProxy> fileStream = getDataFileInfoForPermissionIndexing(version.getId())) {

// Process files in batches to avoid memory issues
fileStream.forEach(fileInfo -> {
filesToReindexAsBatch.add(fileInfo);
fileCounter[0]++;
// Process files in batches to avoid memory issues
fileStream.forEach(fileInfo -> {
filesToReindexAsBatch.add(fileInfo);
fileCounter[0]++;

if (filesToReindexAsBatch.size() >= batchSize) {
reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd);
filesToReindexAsBatch.clear();
}
});
if (filesToReindexAsBatch.size() >= batchSize) {
reindexFilesInBatches(filesToReindexAsBatch, cachedPerms, versionId, solrIdEnd);
filesToReindexAsBatch.clear();
}
});
}
} else {
// For smaller datasets, process files directly
// We only call getFileMetadatas() in the case where we know they have already been loaded
for (FileMetadata fmd : version.getFileMetadatas()) {
DataFileProxy fileProxy = new DataFileProxy(fmd);
filesToReindexAsBatch.add(fileProxy);
Expand Down Expand Up @@ -477,18 +494,6 @@ private void reindexFilesInBatches(List<DataFileProxy> filesToReindexAsBatch, Li
}
}

private List<DatasetVersion> versionsToReIndexPermissionsFor(Dataset dataset) {
List<DatasetVersion> versionsToReindexPermissionsFor = new ArrayList<>();
Map<DatasetVersion.VersionState, Boolean> desiredCards = searchPermissionsService.getDesiredCards(dataset);
for (DatasetVersion version : datasetVersionsToBuildCardsFor(dataset)) {
boolean cardShouldExist = desiredCards.get(version.getVersionState());
if (cardShouldExist) {
versionsToReindexPermissionsFor.add(version);
}
}
return versionsToReindexPermissionsFor;
}

public IndexResponse deleteMultipleSolrIds(List<String> solrIdsToDelete) {
if (solrIdsToDelete.isEmpty()) {
return new IndexResponse("nothing to delete");
Expand Down
Loading