diff --git a/.github/actions/setup-maven/action.yml b/.github/actions/setup-maven/action.yml index e3e0de47329..62ec7020b5b 100644 --- a/.github/actions/setup-maven/action.yml +++ b/.github/actions/setup-maven/action.yml @@ -23,7 +23,7 @@ runs: echo "JAVA_VERSION=$(grep '' ${GITHUB_WORKSPACE}/modules/dataverse-parent/pom.xml | cut -f2 -d'>' | cut -f1 -d'<')" | tee -a ${GITHUB_ENV} - name: Set up JDK ${{ env.JAVA_VERSION }} id: setup-java - uses: actions/setup-java@v4 + uses: actions/setup-java@v5 with: java-version: ${{ env.JAVA_VERSION }} distribution: 'temurin' diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 105469139ec..907452f4614 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -71,7 +71,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@v3 + uses: github/codeql-action/init@v4 with: languages: ${{ matrix.language }} build-mode: ${{ matrix.build-mode }} @@ -99,6 +99,6 @@ jobs: exit 1 - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v3 + uses: github/codeql-action/analyze@v4 with: category: "/language:${{matrix.language}}" diff --git a/.github/workflows/container_app_pr.yml b/.github/workflows/container_app_pr.yml index a5dddc755d1..898b46c2652 100644 --- a/.github/workflows/container_app_pr.yml +++ b/.github/workflows/container_app_pr.yml @@ -23,7 +23,7 @@ jobs: - uses: actions/checkout@v5 with: ref: 'refs/pull/${{ github.event.client_payload.pull_request.number }}/merge' - - uses: actions/setup-java@v4 + - uses: actions/setup-java@v5 with: java-version: "17" distribution: 'adopt' @@ -86,7 +86,7 @@ jobs: :ship: [See on GHCR](https://github.com/orgs/gdcc/packages/container). Use by referencing with full name as printed above, mind the registry name. # Leave a note when things have gone sideways - - uses: peter-evans/create-or-update-comment@v4 + - uses: peter-evans/create-or-update-comment@v5 if: ${{ failure() }} with: issue-number: ${{ github.event.client_payload.pull_request.number }} diff --git a/.github/workflows/container_maintenance.yml b/.github/workflows/container_maintenance.yml index 142363cbe1a..d863f838881 100644 --- a/.github/workflows/container_maintenance.yml +++ b/.github/workflows/container_maintenance.yml @@ -218,7 +218,7 @@ jobs: cat "./modules/container-base/README.md" - name: Push description to DockerHub for base image if: ${{ ! inputs.dry_run && ! inputs.damp_run && toJSON(needs.base-image.outputs.rebuilt_images) != '[]' }} - uses: peter-evans/dockerhub-description@v4 + uses: peter-evans/dockerhub-description@v5 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} @@ -243,7 +243,7 @@ jobs: cat "./src/main/docker/README.md" - name: Push description to DockerHub for application image if: ${{ ! inputs.dry_run && ! inputs.damp_run && toJSON(needs.application-image.outputs.rebuilt_images) != '[]' }} - uses: peter-evans/dockerhub-description@v4 + uses: peter-evans/dockerhub-description@v5 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} @@ -268,7 +268,7 @@ jobs: cat "./modules/container-configbaker/README.md" - name: Push description to DockerHub for config baker image if: ${{ ! inputs.dry_run && ! 
inputs.damp_run && toJSON(needs.configbaker-image.outputs.rebuilt_images) != '[]' }} - uses: peter-evans/dockerhub-description@v4 + uses: peter-evans/dockerhub-description@v5 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} diff --git a/.github/workflows/deploy_beta_testing.yml b/.github/workflows/deploy_beta_testing.yml index 0e060113ba0..48fd5c80d3b 100644 --- a/.github/workflows/deploy_beta_testing.yml +++ b/.github/workflows/deploy_beta_testing.yml @@ -16,7 +16,7 @@ jobs: steps: - uses: actions/checkout@v5 - - uses: actions/setup-java@v4 + - uses: actions/setup-java@v5 with: distribution: 'zulu' java-version: '17' @@ -36,7 +36,7 @@ jobs: run: echo "war_file=$(ls *.war | head -1)">> $GITHUB_ENV - name: Upload war artifact - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v5 with: name: built-app path: ./target/${{ env.war_file }} @@ -50,7 +50,7 @@ jobs: - uses: actions/checkout@v5 - name: Download war artifact - uses: actions/download-artifact@v5 + uses: actions/download-artifact@v6 with: name: built-app path: ./ @@ -69,7 +69,7 @@ jobs: overwrite: true - name: Execute payara war deployment remotely - uses: appleboy/ssh-action@v1.2.2 + uses: appleboy/ssh-action@v1.2.3 env: INPUT_WAR_FILE: ${{ env.war_file }} with: diff --git a/.github/workflows/maven_cache_management.yml b/.github/workflows/maven_cache_management.yml index f266b804534..6bfb567c90b 100644 --- a/.github/workflows/maven_cache_management.yml +++ b/.github/workflows/maven_cache_management.yml @@ -36,7 +36,7 @@ jobs: - name: Determine Java version from Parent POM run: echo "JAVA_VERSION=$(grep '' modules/dataverse-parent/pom.xml | cut -f2 -d'>' | cut -f1 -d'<')" >> ${GITHUB_ENV} - name: Set up JDK ${{ env.JAVA_VERSION }} - uses: actions/setup-java@v4 + uses: actions/setup-java@v5 with: java-version: ${{ env.JAVA_VERSION }} distribution: temurin diff --git a/.github/workflows/maven_unit_test.yml b/.github/workflows/maven_unit_test.yml index a416d5323f0..4de4a953a70 100644 --- a/.github/workflows/maven_unit_test.yml +++ b/.github/workflows/maven_unit_test.yml @@ -39,7 +39,7 @@ jobs: # Basic setup chores - uses: actions/checkout@v5 - name: Set up JDK ${{ matrix.jdk }} - uses: actions/setup-java@v4 + uses: actions/setup-java@v5 with: java-version: ${{ matrix.jdk }} distribution: temurin @@ -62,7 +62,7 @@ jobs: # Upload the built war file. For download, it will be wrapped in a ZIP by GitHub. 
# See also https://github.com/actions/upload-artifact#zipped-artifact-downloads - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@v5 with: name: dataverse-java${{ matrix.jdk }}.war path: target/dataverse*.war @@ -72,7 +72,7 @@ jobs: - run: | tar -cvf java-builddir.tar target tar -cvf java-m2-selection.tar ~/.m2/repository/io/gdcc/dataverse-* - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@v5 with: name: java-artifacts path: | @@ -105,14 +105,14 @@ jobs: # Basic setup chores - uses: actions/checkout@v5 - name: Set up JDK ${{ matrix.jdk }} - uses: actions/setup-java@v4 + uses: actions/setup-java@v5 with: java-version: ${{ matrix.jdk }} distribution: temurin cache: maven # Get the build output from the unit test job - - uses: actions/download-artifact@v5 + - uses: actions/download-artifact@v6 with: name: java-artifacts - run: | @@ -124,7 +124,7 @@ jobs: # Wrap up and send to coverage job - run: tar -cvf java-reportdir.tar target/site - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@v5 with: name: java-reportdir path: java-reportdir.tar @@ -138,14 +138,14 @@ jobs: # TODO: As part of #10618 change to setup-maven custom action # Basic setup chores - uses: actions/checkout@v5 - - uses: actions/setup-java@v4 + - uses: actions/setup-java@v5 with: java-version: '17' distribution: temurin cache: maven # Get the build output from the integration test job - - uses: actions/download-artifact@v5 + - uses: actions/download-artifact@v6 with: name: java-reportdir - run: tar -xvf java-reportdir.tar diff --git a/.github/workflows/spi_release.yml b/.github/workflows/spi_release.yml index 9dc722c5992..378e6ff9b67 100644 --- a/.github/workflows/spi_release.yml +++ b/.github/workflows/spi_release.yml @@ -38,11 +38,11 @@ jobs: if: github.event_name == 'pull_request' && needs.check-secrets.outputs.available == 'true' steps: - uses: actions/checkout@v5 - - uses: actions/setup-java@v4 + - uses: actions/setup-java@v5 with: java-version: '17' distribution: 'adopt' - server-id: ossrh + server-id: central server-username: MAVEN_USERNAME server-password: MAVEN_PASSWORD - uses: actions/cache@v4 @@ -64,7 +64,7 @@ jobs: if: github.event_name == 'push' && needs.check-secrets.outputs.available == 'true' steps: - uses: actions/checkout@v5 - - uses: actions/setup-java@v4 + - uses: actions/setup-java@v5 with: java-version: '17' distribution: 'adopt' @@ -76,11 +76,11 @@ jobs: # Running setup-java again overwrites the settings.xml - IT'S MANDATORY TO DO THIS SECOND SETUP!!! 
- name: Set up Maven Central Repository - uses: actions/setup-java@v4 + uses: actions/setup-java@v5 with: java-version: '17' distribution: 'adopt' - server-id: ossrh + server-id: central server-username: MAVEN_USERNAME server-password: MAVEN_PASSWORD gpg-private-key: ${{ secrets.DATAVERSEBOT_GPG_KEY }} diff --git a/conf/keycloak/builtin-users-spi/pom.xml b/conf/keycloak/builtin-users-spi/pom.xml index 36cf6548d01..2a730621f85 100644 --- a/conf/keycloak/builtin-users-spi/pom.xml +++ b/conf/keycloak/builtin-users-spi/pom.xml @@ -100,7 +100,7 @@ - 26.3.2 + 26.3.4 17 3.2.0 0.4 diff --git a/conf/mdc/counter_weekly.sh b/conf/mdc/counter_weekly.sh new file mode 100644 index 00000000000..67cb5df2af2 --- /dev/null +++ b/conf/mdc/counter_weekly.sh @@ -0,0 +1,92 @@ +#!/bin/sh +#counter_weekly.sh + +# This script iterates through all published Datasets in all Dataverses and calls the Make Data Count API to update their citations from DataCite +# Note: Requires curl and jq for parsing JSON responses form curl + +# A recursive method to process each Dataverse +processDV () { +echo "Processing Dataverse ID#: $1" + +#Call the Dataverse API to get the contents of the Dataverse (without credentials, this will only list published datasets and dataverses +DVCONTENTS=$(curl -s http://localhost:8080/api/dataverses/$1/contents) + +# Iterate over all datasets, pulling the value of their DOIs (as part of the persistentUrl) from the json returned +for subds in $(echo "${DVCONTENTS}" | jq -r '.data[] | select(.type == "dataset") | .persistentUrl'); do + +#The authority/identifier are preceded by a protocol/host, i.e. https://doi.org/ +DOI=`expr "$subds" : '.*:\/\/\doi\.org\/\(.*\)'` + +# Call the Dataverse API for this dataset and capture both the response and HTTP status code +HTTP_RESPONSE=$(curl -s -w "\n%{http_code}" -X POST "http://localhost:8080/api/admin/makeDataCount/:persistentId/updateCitationsForDataset?persistentId=doi:$DOI") + +# Extract the HTTP status code from the last line +HTTP_STATUS=$(echo "$HTTP_RESPONSE" | tail -n1) +# Extract the response body (everything except the last line) +RESPONSE_BODY=$(echo "$HTTP_RESPONSE" | sed '$d') + +# Check the HTTP status code and report accordingly +case $HTTP_STATUS in + 200) + # Successfully queued + # Extract status from the nested data object + STATUS=$(echo "$RESPONSE_BODY" | jq -r '.data.status') + + # Extract message from the nested data object + if echo "$RESPONSE_BODY" | jq -e '.data.message' > /dev/null 2>&1 && [ "$(echo "$RESPONSE_BODY" | jq -r '.data.message')" != "null" ]; then + MESSAGE=$(echo "$RESPONSE_BODY" | jq -r '.data.message') + echo "[SUCCESS] doi:$DOI - $STATUS: $MESSAGE" + else + # If message is missing or null, just show the status + echo "[SUCCESS] doi:$DOI - $STATUS: Citation update queued" + fi + ;; + 400) + # Bad request + if echo "$RESPONSE_BODY" | jq -e '.message' > /dev/null 2>&1; then + ERROR=$(echo "$RESPONSE_BODY" | jq -r '.message') + echo "[ERROR 400] doi:$DOI - Bad request: $ERROR" + else + echo "[ERROR 400] doi:$DOI - Bad request" + fi + ;; + 404) + # Not found + if echo "$RESPONSE_BODY" | jq -e '.message' > /dev/null 2>&1; then + ERROR=$(echo "$RESPONSE_BODY" | jq -r '.message') + echo "[ERROR 404] doi:$DOI - Not found: $ERROR" + else + echo "[ERROR 404] doi:$DOI - Not found" + fi + ;; + 503) + # Service unavailable (queue full) + if echo "$RESPONSE_BODY" | jq -e '.message' > /dev/null 2>&1; then + ERROR=$(echo "$RESPONSE_BODY" | jq -r '.message') + echo "[ERROR 503] doi:$DOI - Service unavailable: $ERROR" + elif echo 
"$RESPONSE_BODY" | jq -e '.data.message' > /dev/null 2>&1; then + ERROR=$(echo "$RESPONSE_BODY" | jq -r '.data.message') + echo "[ERROR 503] doi:$DOI - Service unavailable: $ERROR" + else + echo "[ERROR 503] doi:$DOI - Service unavailable: Queue is full" + fi + ;; + *) + # Other error + echo "[ERROR $HTTP_STATUS] doi:$DOI - Unexpected error" + echo "Response: $RESPONSE_BODY" + ;; +esac + +done + +# Now iterate over any child Dataverses and recursively process them +for subdv in $(echo "${DVCONTENTS}" | jq -r '.data[] | select(.type == "dataverse") | .id'); do +echo $subdv +processDV $subdv +done + +} + +# Call the function on the root dataverse to start processing +processDV 1 \ No newline at end of file diff --git a/doc/release-notes/6.9-release-notes.md b/doc/release-notes/6.9-release-notes.md new file mode 100644 index 00000000000..c49e4697096 --- /dev/null +++ b/doc/release-notes/6.9-release-notes.md @@ -0,0 +1,343 @@ +# Dataverse 6.9 + +Please note: To read these instructions in full, please go to https://github.com/IQSS/dataverse/releases/tag/v6.9 rather than the [list of releases](https://github.com/IQSS/dataverse/releases), which will cut them off. + +This release brings new features, enhancements, and bug fixes to Dataverse. Thank you to all of the community members who contributed code, suggestions, bug reports, and other assistance across the project! + +## Release Highlights + +Highlights for Dataverse 6.9 include: + +- Quickstart Guide +- Role Assignment History Tracking +- Scaling Dataverse with Data Size (Admin Guide) +- Storage Quotas on Individual Datasets +- Additional Licenses +- DataCite Scaling +- Support for COAR Notify Relationship Announcement +- Infrastructure upgrade: Payara +- New and improved APIs +- Bug fixes + +## Features Added + +### Quickstart Guide + +A new [Quickstart Guide](https://guides.dataverse.org/en/6.9/quickstart/index.html) has been added to help researchers understand what Dataverse is, how to publish datasets and collections, and how to find data. Buttons have been added to the top of the User Guide to direct people to these pages. Feedback on this new guide is welcome, such as through [Google Groups](https://groups.google.com/g/dataverse-community/c/TXOEFv9-0kY/m/RT5XV0QiAgAJ), [Zulip](https://dataverse.zulipchat.com/#narrow/channel/446770-docs/topic/Quickstart.20Guide.20'Publish.20a.20Dataset'/with/561675778), or the [Documentation Working Group](https://www.gdcc.io/working-groups/documentation.html). Special thanks to Dieuwertje Bloemen for writing most of the guide and to Vaida Plankytė for contributing. See https://guides.dataverse.org/en/6.9/quickstart and #11653. + +### Role Assignment History Tracking + +Dataverse can now track the history of role assignments, allowing administrators to see who assigned or revoked roles, when these actions occurred, and which roles were involved. This feature helps with auditing and understanding permission changes over time. An additional column called "Role Assignment History" has been added to the permission management page for collections. The information can also be downloaded via API in CSV and JSON formats. This feature is off by default but can be enabled with the "role-assignment-history" [feature flag](https://guides.dataverse.org/en/6.9/installation/config.html#feature-flags). 
See the [User Guide](https://guides.dataverse.org/en/6.9/user/dataverse-management.html#roles-permissions), [API Guide](https://guides.dataverse.org/en/6.9/api/native-api.html#dataverse-role-assignment-history), and #11612. + +### Scaling Dataverse with Data Size (Admin Guide) + +A new section, [Scaling Dataverse with Data Size](https://guides.dataverse.org/en/6.9/admin/big-data-administration.html), has been added to the Admin Guide. It is intended to help administrators configure Dataverse appropriately to handle larger amounts of data. See https://guides.dataverse.org/en/6.9/admin/big-data-administration.html and #11850. + +### Storage Quotas on Individual Datasets + +It is now possible to define storage quotas on individual datasets via API. The practical use case is for datasets in the top-level root collection. This feature does not address the use case of a user creating multiple datasets (#11529). A convenience API `/api/datasets/{id}/uploadlimits` has been added to show the "remaining storage" and "number of files" quotas, if present. See [the guides](https://guides.dataverse.org/en/6.9/api/native-api.html#storage-quotas-on-individual-datasets), #11987, and #11997. + +### Additional Licenses + +The following Open Data Commons licenses have been added: + +- Open Database License (ODbL) +- Open Data Commons Attribution License (ODC-By) +- Open Data Commons Public Domain Dedication and License (PDDL) + +The following software license has been added: + +- European Union Public License (EUPL) + +The following country-specific license has been added: + +- Open Government Licence (OGL UK) + +The licenses above are widely recognized and used in Europe and beyond to promote data and software sharing. See [the guides](https://guides.dataverse.org/en/6.9/installation/config.html#configuring-licenses) and #11522. + +### DataCite Scaling + +Dataverse now retries calls to DataCite when their server is overloaded or when the Dataverse server has hit the DataCite rate limit. + +It also introduces an option to only update DataCite metadata after checking to see if the current DataCite information is out of date. (This adds a request to get information from DataCite before any potential write of new information which will be more efficient when most DOIs have not changed but will result in an extra call to get info when a DOI has changed.) This functionality is off by default but can be enabled with the `only-update-datacite-when-needed` feature flag. + +Both of these can help when DataCite is being used heavily, e.g. creating and publishing datasets with many datafiles and using file DOIs, or doing bulk operations that involve DataCite with many datasets. + +See [the guides](https://guides.dataverse.org/en/6.9/installation/config.html#dataverse-feature-only-update-datacite-when-needed) and #11832 + +### Support for COAR Notify Relationship Announcement + +Dataverse now supports sending and receiving [Linked Data Notification ](https://www.w3.org/TR/ldn/) messages involved in the [COAR Notify Relationship Announcement Workflow](https://coar-notify.net/catalogue/workflows/repository-relationship-repository/). + +Dataverse can send messages to configured repositories announcing that a dataset has a related publication (as defined in the dataset metadata). This may be done automatically upon publication or triggered manually by a superuser. 
The receiving repository may do anything with the message, with the default expectation being that the repository will create a backlink from the publication to the dataset (assuming the publication exists in the repository, admins agree the link makes sense, etc.) + +Conversely, Dataverse can receive notices from other configured repositories announcing relationships between their publications and datasets. If the referenced dataset exists in the Dataverse instance, a notification will be sent to users who can publish the dataset, or, optionally, only superusers who can publish the dataset. They can then decide whether to create a backlink to the publication in the dataset metadata. + +See [the guides](https://guides.dataverse.org/en/6.9/developers/workflows.html#coarnotifyrelationshipannouncement), #8914, and #10490. (Earlier releases of Dataverse had experimental support in this area that was based on message formats defined prior to finalization of the COAR Notify specification for relationship announcements.) + +### Other Features Added + +- When creating a dataset, the "Host Dataverse" field is not shown when the user can add datasets to only one collection. See #11865. +- In the UI for granting file access, restricted files in draft will now show "Draft/Unpublished". See #7618 and #11794. +- This version of Dataverse includes extensions of the Dataverse [external vocabulary mechanism](https://guides.dataverse.org/en/6.9/admin/metadatacustomization.html#using-external-vocabulary-services) that improve Dataverse's ability to include metadata about vocabulary terms and external identifiers such as ORCID and ROR in its metadata exports. More information on how to configure external vocabulary scripts to use this functionality can be found in [docs/readme.md](https://github.com/gdcc/dataverse-external-vocab-support/blob/main/docs/readme.md) and in the examples in the https://github.com/gdcc/dataverse-external-vocab-support repository. See #11793. +- The [external vocabulary mechanism](https://github.com/gdcc/dataverse-external-vocab-support) now supports assigning metadatablock dataset field types of fieldType textbox (multiline inputs) as managed fields. See #11954. +- It is now possible to configure all database settings at once with a JSON file. See [the guides](https://guides.dataverse.org/en/6.9/api/native-api.html#configure-all-database-settings), #11639, and #11654. +- CORS support has been modernized so browser integrations (previewers, external tools, JS clients) work correctly with multiple origins and proper caching. See [the guides](https://guides.dataverse.org/en/6.9/installation/config.html#cors-settings), #11744, and #11745. +- Integration with Local Contexts has been updated to support the change in their API regarding how DOIs entered as "Optional Project Information" are represented. See [the guides](https://guides.dataverse.org/en/6.9/installation/localcontexts.html) and #11904. +- This release adds database indexes on GuestbookResponse and DatasetMetrics, speeding up Dataset deletes. It also adds a constraint preventing null VersionState, as a matter of good housekeeping practice. See #11828 and #11898. +- Permission reindexing, which occurs, e.g., after a user has been granted a role on a collection, has been made faster and less memory intensive in this release. See #11822. +- Performance has been improved when retrieving citations from DataCite. A related setting called `dataverse.api.mdc.min-delay-ms` has been added. 
See [the guides](https://guides.dataverse.org/en/6.9/installation/config.html#dataverse-api-mdc-min-delay-ms), #11777, and #11781. +- Processing of comma-separated lists in settings has been centralized and now ignores spaces around commas. See [the guides](https://guides.dataverse.org/en/6.9/installation/config.html#comma-separated-configuration-values) and #11745. + +## Bugs Fixed + +- Editing a controlled vocabulary field (i.e. one with values specified in the field's metadatablock) that only allows a single selection would also update the value in the prior published version if (and only if) the edit was made starting from the published version (versus an existing draft). This is now fixed. The bug appears to be 11+ years old and previously unreported. As the value in the database was overwritten, there is no simple way to detect if or when this occurred without looking at backups or archival file copies. See #11990 and #11991. +- In prior versions of Dataverse, publishing a dataset via the superuser-only update-current-version option would not set the current curation status (if enabled/used) to none/empty and, in v6.7, would not maintain the curation status history. These issues are now resolved and the update-current-version option works the same as normal publication of a new version with regard to curation status. See #11783 and #11784. +- This release fixes two problems with guestbook questions when guestbook-at-request is enabled: they were displayed at download time when files were selected from the dataset files table, and they were not displayed when they should have been when access was requested from the file page. See #11800, #11808, and #11835. +- The optional Croissant exporter has been updated to 0.1.6 to prevent variable names, variable descriptions, and variable types from being exposed for restricted files. See https://github.com/gdcc/exporter-croissant/pull/20 and #11752. +- The Manage Guestbooks page was optimized to load much faster for collections with large numbers of downloads recorded. + +## API Updates + +- The Settings API no longer accepts keys (in the sense of key/value pairs) that are not defined in the code. See "Database Settings Cleanup" in these release notes for details, as well as #11639 and #11654. +- A new API endpoint has been added for retrieving a list of collections to which a given dataset or collection can be linked. See [the guides](https://guides.dataverse.org/en/6.9/api/native-api.html#list-dataverse-collections-to-which-a-given-dataset-or-dataverse-collection-may-be-linked), #11710, and #11741. +- A new API endpoint has been added to manage dataset licenses. See [the guides](https://guides.dataverse.org/en/6.9/api/native-api.html#update-dataset-license), #11771, #11815, and #11958. +- A new API endpoint has been added to manage dataset terms of access for restricted files. See [the guides](https://guides.dataverse.org/en/6.9/api/native-api.html#update-dataset-terms-of-access), #11772, and #11893. +- A new API endpoint has been added for getting and setting the metadata language of a collection. See [the guides](https://guides.dataverse.org/en/6.9/api/native-api.html#list-the-allowed-metadata-languages-of-a-dataverse-collection), #11856, and #11857. +- The `/api/dataverses/{identifier}/templates` endpoint previously required editDataverse permissions to retrieve the list of dataverse templates. This has been corrected. The endpoint now requires addDataset permissions instead. See #11796 and #11801.
+- The endpoint `/api/datasets/{id}/versions/compareSummary` was previously returning an incorrect count for +the `changedFileMetaData` field. The logic for calculating this count has been fixed to accurately reflect the total number of file metadata changes across all files in the dataset version. See #11921 and #11944. +- The "File Version Differences" and "Dataset Version Summaries" API endpoints have been improved with pagination support (with `limit` and `offset` parameters), performance improvements, and a bug fix. See [the guides](https://guides.dataverse.org/en/6.9/api/native-api.html#get-versions-of-a-dataset-with-summary-of-changes), #11561, #11855, and #11859. +- The Notifications API has been improved (with `onlyUnread`, `limit`, and `offset` parameters) but also has some breaking changes. See [the guides](https://guides.dataverse.org/en/6.9/api/native-api.html#notifications), #11804, #11851, and #11854. +- The storage API driver endpoint now returns a JSON object with the driver's "name", "type" and "label", and booleans indicating whether the driver has "directUpload", "directDownload", and/or "uploadOutOfBand" enabled. This change also affects the /api/admin/dataverse/{dataverse-alias}/storageDriver API call. In addition, this call now supports an optional ?getEffective=true to find the effective storageDriver (the driver that will be used for new datasets in the collection). See [the guides](https://guides.dataverse.org/en/6.9/admin/dataverses-datasets.html#configure-a-dataverse-collection-to-store-all-new-files-in-a-specific-file-store), #11695, and #11716. + +## Security Updates + +This release contains important security updates, such as an upgrade to Payara (#11827) and changes to how CORS is handled (#11745). If you are not receiving security notices, please sign up by following [the steps](https://guides.dataverse.org/en/latest/installation/config.html#ongoing-security-of-your-installation) in the guides. + +## Backward Incompatible Changes + +Generally speaking, see the [API Changelog](https://guides.dataverse.org/en/latest/api/changelog.html) for a list of backward-incompatible API changes. + +- This release of Dataverse requires an upgrade to Payara. Please be aware that you need to upgrade Payara and Dataverse at the same time because older versions of Dataverse are not compatible with newer versions of Payara. See the upgrade instructions for details. +- CORS is no longer enabled by default. See the upgrade instructions for details as well as #11745. +- See the "Database Settings Cleanup" section below. In the past, the settings API would accept any key and value. This is no longer the case because validation has been added. Also, the way to set per-format size limits for tabular ingest has changed. JSON input is now used. See #11639 and #11654. +- The Update Collection Input Levels API endpoint no longer deletes the custom input levels previously modified for the given collection. In order to update a previously modified custom input level, it must be included in the JSON provided to the API. See [the guides](https://guides.dataverse.org/en/6.9/api/native-api.html#update-collection-input-levels), #11387, and #11748. +- For `/api/admin/dataverse/{dataverse-alias}/storageDriver` and `/api/datasets/{identifier}/storageDriver` the driver name is no longer returned in data.message. Instead, it is returned as data.name (along with other information about the storageDriver).
See [the guides](https://guides.dataverse.org/en/6.9/api/native-api.html#configure-a-dataset-to-store-all-new-files-in-a-specific-file-store), #11695, and #11664. +- The POST /api/admin/makeDataCount/{id}/updateCitationsForDataset processing is now asynchronous and the response no longer includes the number of citations. The response can be OK if the request is queued or 503 if the queue is full (default queue size is 1000). See #11777 and #11781. + +## End-Of-Life (EOL) Announcements + +### PostgreSQL 13 Reached EOL on 13 November 2025 + +We mentioned this in the Dataverse [6.6](https://github.com/IQSS/dataverse/releases/tag/v6.6) and [6.8](https://github.com/IQSS/dataverse/releases/tag/v6.8) release notes, but as a reminder, according to https://www.postgresql.org/support/versioning/ PostgreSQL 13 reached EOL on 13 November 2025. As mentioned in the [Installation Guide](https://guides.dataverse.org/en/6.9/installation/prerequisites.html#postgresql), we recommend running PostgreSQL 16 since it is the version we test with in our continuous integration ([since](https://github.com/gdcc/dataverse-ansible/commit/8ebbd84ad2cf3903b8f995f0d34578250f4223ff) February 2025). The [Dataverse 5.4 release notes](https://github.com/IQSS/dataverse/releases/tag/v5.4) explained the upgrade process from 9 to 13 (e.g. pg_dumpall, etc.) and the steps will be similar. If you have any problems, please feel free to reach out (see "getting help" in these release notes). + +### Payara 6 Reaching EOL with 6.2025.11 + +As explained in #12020 and https://payara.fish/blog/payara-6-community-end-of-life/ Payara 6.2025.11 is the last release in the 6.x series. A pull request at #12043 to upgrade to Payara 7 has already been made but won't make it into this release of Dataverse. You are welcome to try it out and give feedback. + +## Developer Updates + +- The ExportDataProvider framework in the dataverse-spi package has been extended, adding some extra options for developers of metadata exporter plugins. See [the guides](https://guides.dataverse.org/en/6.9/developers/metadataexport.html#building-an-exporter) #11766, and #11767. + +## Database Settings Cleanup + +With this release, we remove some legacy quirks around Database settings and provide better Admin API endpoints for managing settings. + +Most important changes: + +1. The setting `BuiltinUsers.KEY` was renamed to `:BuiltinUsersKey` to align with our naming pattern. +2. The setting `WorkflowsAdmin#IP_WHITELIST_KEY` was renamed to `:WorkflowsAdminIpWhitelist` to align with our naming pattern. +3. The setting `:TabularIngestSizeLimit` no longer uses suffixes for formats and becomes a JSON-based setting instead. See [the guides](https://guides.dataverse.org/en/6.9/installation/config.html#tabularingestsizelimit) for details. +4. If any of the settings above are set, they will be migrated to their new form automatically for you when the war file is deployed (Flyway migration). +5. You can no longer (accidentally) create or use arbitrary setting names or languages. + All Admin API endpoints for settings now validate setting names and languages for existence and compliance. + +As an administrator of a Dataverse instance, you can now make use of enhanced Bulk Operations on the Settings Admin API: + +1. Retrieving all settings as JSON via `GET /api/admin/settings` supports localized options now, too. +2. You can replace all existing settings in an idempotent way sending JSON to `PUT /api/admin/settings`. 
+ This will create, update and remove settings as necessary in one atomic operation. + The new endpoint is especially useful to admins using GitOps or other automations. + It allows control over all Database Settings from a single source without risking an undefined state. + +Note: Despite the validation of setting names and languages, the content of any database setting is still not being validated when using the Settings Admin API! + +### Important Considerations During Upgrade Of Your Installation + +1. Running a customized fork? Make sure to add any custom settings to the SettingsServiceBean.Key enum before deploying! +2. Any database settings not contained in the `SettingServiceBean.Key` will be removed from your database during each deployment cycle. +3. As always when upgrading, make sure to backup your database beforehand! + You can also use the existing API endpoint `/api/admin/settings` to retrieve all settings as JSONish data for a quick backup before upgrading. + +See also #11639 and #11654. + +## New Settings + +### New JVM Options (MicroProfile Config Settings) + +- dataverse.api.mdc.min-delay-ms +- dataverse.coar-notify.relationship-announcement.notify-superusers-only +- dataverse.feature.only-update-datacite-when-needed +- dataverse.feature.role-assignment-history +- dataverse.ldn.allowed-hosts + +### New Database Settings + +- :COARNotifyRelationshipAnnouncementTargets +- :COARNotifyRelationshipAnnouncementTriggerFields + +### Updated Database Settings + +The following database settings were added to the official list within the code (to remain valid with the settings cleanup mentioned above): + +- `:BagGeneratorThreads` +- `:BagItHandlerEnabled` +- `:BagItLocalPath` +- `:BagValidatorJobPoolSize` +- `:BagValidatorJobWaitInterval` +- `:BagValidatorMaxErrors` +- `:BuiltinUsersKey` - formerly `BuiltinUsers.KEY` +- `:CreateDataFilesMaxErrorsToDisplay` +- `:DRSArchiverConfig` - a Harvard-specific setting +- `:DuraCloudContext` +- `:DuraCloudHost` +- `:DuraCloudPort` +- `:FileCategories` +- `:GoogleCloudBucket` +- `:GoogleCloudProject` +- `:LDNAnnounceRequiredFields` +- `:LDNTarget` +- `:WorkflowsAdminIpWhitelist` - formerly `WorkflowsAdmin#IP_WHITELIST_KEY` +- `:PrePublishDatasetWorkflowId` - formerly `WorkflowServiceBean.WorkflowId:PrePublishDataset` +- `:PostPublishDatasetWorkflowId` - formerly `WorkflowServiceBean.WorkflowId:PostPublishDataset` + +## Deleted Settings + +- In Dataverse 6.7 (#11454) the `:AllowCors` setting was deprecated in favor of a new `dataverse.cors.origin` setting and now, in Dataverse 6.9, the `:AllowCors` setting has been removed (#11745). See also the step on CORS in the upgrade instructions below. +- `BuiltinUsers.KEY` - now `:BuiltinUsersKey` +- `WorkflowsAdmin#IP_WHITELIST_KEY` - now `:WorkflowsAdminIpWhitelist` +- `WorkflowServiceBean.WorkflowId:PrePublishDataset` - now `:PrePublishDatasetWorkflowId` +- `WorkflowServiceBean.WorkflowId:PostPublishDataset` - now `:PostPublishDatasetWorkflowId` + +## Complete List of Changes + +For the complete list of code changes in this release, see the [6.9 milestone](https://github.com/IQSS/dataverse/issues?q=milestone%3A6.9+is%3Aclosed) in GitHub. + +## Getting Help + +For help with upgrading, installing, or general questions please see [getting help](https://guides.dataverse.org/en/latest/installation/intro.html#getting-help) in the Installation Guide. + +## Installation + +If this is a new installation, please follow our [Installation Guide](https://guides.dataverse.org/en/latest/installation/). 
Please don't be shy about [asking for help](https://guides.dataverse.org/en/latest/installation/intro.html#getting-help) if you need it! + +Once you are in production, we would be delighted to update our [map of Dataverse installations around the world](https://dataverse.org/installations) to include yours! Please [create an issue](https://github.com/IQSS/dataverse-installations/issues) or email us at support@dataverse.org to join the club! + +You are also very welcome to join the [Global Dataverse Community Consortium](https://www.gdcc.io/) (GDCC). + +## Upgrade Instructions + +NOTE: These instructions include an upgrade to Payara 6.2025.10. Do not attempt to run Dataverse 6.8 or lower on Payara 6.2025.10. It won't work. To see the changes that had to be made to the Dataverse code to make it compatible with Payara 6.2025.10, see #11827. + +Upgrading requires a maintenance window and downtime. Please plan accordingly, create backups of your database, etc. + +When doing backups, make sure your settings are backed up. See "Database Settings Cleanup" above for details. + +0\. These instructions assume that you are upgrading from the immediate previous version. That is to say, you've already upgraded through all the 6.x releases and are now running Dataverse 6.8. See [tags on GitHub](https://github.com/IQSS/dataverse/tags) for a list of versions. If you are running an earlier version, the only supported way to upgrade is to progress through the upgrades to all the releases in between before attempting the upgrade to this version. + +If you are running Payara as a non-root user (and you should be!), **remember not to execute the commands below as root**. By default, Payara runs as the `dataverse` user. In the commands below, we use sudo to run the commands as a non-root user. + +Also, we assume that Payara 6 is installed in `/usr/local/payara6`. If not, adjust as needed. + +```shell +export PAYARA=/usr/local/payara6 +``` + +(or `setenv PAYARA /usr/local/payara6` if you are using a `csh`-like shell) + +1\. List deployed applications + +```shell +$PAYARA/bin/asadmin list-applications +``` + +2\. Undeploy the previous version (should match "list-applications" above) + +```shell +$PAYARA/bin/asadmin undeploy dataverse-6.8 +``` + +3\. Stop Payara + +```shell +sudo service payara stop +``` + +4\. Upgrade to Payara 6.2025.10 + +The recommended Payara version has been updated to Payara 6.2025.10. See #11827. + +As a reminder, Payara 6.2025.10 cannot be used with earlier versions of Dataverse, e.g. v6.8. + +The steps below reuse your existing domain directory with the new distribution of Payara. You may also want to review the Payara upgrade instructions as they could be helpful during any troubleshooting: +[Payara Release Notes](https://docs.payara.fish/community/docs/6.2025.10/Release%20Notes/Release%20Notes%206.2025.10.html). +We also recommend you ensure you followed all update instructions from the past releases regarding Payara. +(The most recent Payara update was for [Dataverse 6.7](https://github.com/IQSS/dataverse/releases/tag/v6.7).)
+ +Move the current Payara directory out of the way: + +```shell +mv $PAYARA $PAYARA.6.2025.3 +``` + +Download the new Payara version 6.2025.10 (from https://www.payara.fish/downloads/payara-platform-community-edition/ or https://nexus.payara.fish/repository/payara-community/fish/payara/distributions/payara/6.2025.10/payara-6.2025.10.zip), and unzip it in its place: + +```shell +cd /usr/local +unzip payara-6.2025.10.zip +``` + +Replace the brand new `payara6/glassfish/domains/domain1` with your old, preserved domain1: + +```shell +mv payara6/glassfish/domains/domain1 payara6/glassfish/domains/domain1_DIST +mv payara6.6.2025.3/glassfish/domains/domain1 payara6/glassfish/domains/ +``` + +Once your old, preserved domain1 directory is in place, copy the *.p12 files from distributed domain1/config directory into it, at least if they are different. These files include base certs. Over time, using the old cert files can cause SSL errors. If the domain.xml file is old enough, it may also reference *.jks files instead of the *.p12 ones. If so, update domain.xml so that these lines reference *.p12 files instead of *.jks files. + +```shell +cp payara6/glassfish/domains/domain1_DIST/*.p12 payara6/glassfish/domains/domain1 +``` + +5\. Download and deploy this version + +```shell +wget https://github.com/IQSS/dataverse/releases/download/v6.9/dataverse-6.9.war +$PAYARA/bin/asadmin deploy dataverse-6.9.war +``` + +Please note that, depending on your database, the initial deployment of this version may take a little longer than usual. This is because new database indexes will need to be created in real time. The time it should take is not even a function of the overall size of your database, but specifically the `GuestbookResponse` table. (In other words, not a function of how much content you have in the repository, but how *popular* it is). + +6\. For installations with internationalization or text customizations: + +Please remember to update translations via [Dataverse language packs](https://github.com/GlobalDataverseCommunityConsortium/dataverse-language-packs). + +If you have text customizations you can get the latest English files from . + +7\. Enable or re-enable CORS, if desired + +As of Dataverse 6.8 CORS was enabled by default but this is no longer the case. See #11745. + +If you are relying on CORS, e.g. using file previewers hosted at gdcc.github.io, and did not configure `dataverse.cors.origin` as recommended in the 6.7 release notes, you should do so now. With updates in 6.9, it is also possible to limit CORS support to specific internet hosts. See [the guides](https://guides.dataverse.org/en/6.9/installation/config.html#cors-settings) for details. + +8\. Update Croissant exporter, if enabled, and reexport metadata + +If you have enabled the Croissant dataset metadata exporter, you should upgrade to version 0.1.6. + +- Stop Payara. +- Delete the old Croissant exporter jar file. It will be located in the directory defined by the `dataverse.spi.exporters.directory` setting. +- Download the updated Croissant jar from https://repo1.maven.org/maven2/io/gdcc/export/croissant/ and place it in the same directory. +- Restart Payara. +- Run reExportAll. 
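As a sketch of that last step (assuming the Admin API is reachable on localhost and not blocked, which is the typical setup):

```shell
# Kick off an asynchronous re-export of all published dataset metadata
curl http://localhost:8080/api/admin/metadata/reExportAll
```

The re-export runs in the background; progress can be followed in the server log.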
diff --git a/doc/sphinx-guides/source/admin/big-data-administration.rst b/doc/sphinx-guides/source/admin/big-data-administration.rst new file mode 100644 index 00000000000..b3c7e79c382 --- /dev/null +++ b/doc/sphinx-guides/source/admin/big-data-administration.rst @@ -0,0 +1,321 @@ +Scaling Dataverse with Data Size +================================ + +This section is intended to help administrators configure Dataverse appropriately to handle larger amounts of data. + +Scaling is a complex subject: there are many options available in Dataverse that can improve performance with larger scale data, some of which +work differently than Dataverse's default configuration, potentially requiring user education, and some which can require additional expertise to manage. + +In general, there are three dimensions in which Dataverse can scale: + +1. **Storage size of individual files and aggregate storage size** +2. **Number of files per dataset** +3. **Number of datasets** + + +.. contents:: |toctitle| + :local: + +.. _choose-store: + +Storage: Choosing the Right Store +--------------------------------- + +The main issues for handling larger files and larger aggregate data size relate to the performance of the storage used and how involved the Dataverse server is in the data transfer. +With appropriate configuration, Dataverse can support file sizes and aggregate dataset sizes into the terabyte scale and beyond. + +The primary choice in Dataverse related to storage is which types of "store" (also called "storage driver") to use: + +.. _file-stores: + +File Stores +~~~~~~~~~~~ + +The default storage option in Dataverse uses the local file system. When files are transferred to Dataverse, they are first stored in a +temporary location on the Dataverse server. Any zip files uploaded are unzipped to create multiple individual file entries. Once an upload is completed, +Dataverse copies the files to permanent storage. Dataverse also takes advantage of the file being local to inspect its bytes to determine its +MIME type, and, for tabular data, to ":doc:`ingest `" it - extracting metadata about the variables used in the file and creating a tab-separated values (TSV) +version of the file. + +Benefits: + +- This option requires no external services and can potentially handle files into the gigabyte (GB) size range. For smaller institutions, + and in disciplines where datasets do not have more than a few hundred files and files are not too large, this can be the simplest option. +- Unzipping of zip archives can be simpler for users than having to upload many individual files and was, at one time, the only way to + preserve file path names when uploading. In addition, some of the unzipped files might be in a format that can be previewed or otherwise acted upon - see :ref:`file-handling` in the User Guide. + +Challenges: In general, file storage is not a good option for larger data sizes - both in terms of file size and number of files. Contributing factors include: + +- Because temporary storage is used, transfers will temporarily use several times as much space as the final transfer. Unzipping also increases the final storage size of a dataset. +- Because all uploads use the same temporary storage, temporary storage must be large enough to handle multiple users uploading data. +- Each file is uploaded as a single HTTP request, which can cause long transfer times which, in turn, can trigger timeout errors in Dataverse or any proxy or load balancer in front of Dataverse. 
+- Uploading many files at once can trigger any rate limiter in front of the Dataverse server (i.e. used to throttle AI bots) resulting in failures. +- Because transfers (both uploads and downloads) are handled by the Dataverse server, they add to server processing load which can affect overall performance. +- Cost: local file storage must be provisioned in advance based on anticipated demand. It can involve up-front costs (for a local disk), or, when procured from a + cloud provider, is likely to be more expensive than object storage from that provider (see below). + +.. _s3-stores: + +S3 Stores: Object Storage via S3 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A more scalable option for storage is to use an object store with transfers managed using the Simple Storage Service (S3) protocol. S3-compatible storage can +easily be bought (rented) from major cloud providers, but may also be available from institutional clouds. It is also possible to run open source software to provide +S3 storage over a local file system (making it possible to enjoy the advantages discussed below while still leveraging local file storage). + +While S3 Stores can be configured to handle uploads and downloads as with file storage (with zip files being unzipped, but having many of the same challenges in terms of temporary storage and server load as discussed above) +they can also be configured to use "direct" upload and download. In this configuration, the actual transfer of file bytes is between the user's local machine and +the S3 store. In this configuration, files are never stored on the Dataverse server. Dataverse does not attempt to unzip zip files, and they are stored as a single file in the dataset. + +Benefits: S3 offers several advantages over file storage: + +- Scalability: S3 is designed to handle large amounts of data. It can handle individual files up to several TB in size. + Because S3 supports breaking files into pieces, Dataverse can transfer a file in pieces (several in parallel, potentially thousands of pieces per file) making transfers faster + and more robust (a failure requires only resending the failed piece). It may also be the case that users will have a faster network connection to the S3 store + (e.g. in a commercial cloud or High Performance Computing center) than they do to the Dataverse server, reducing transfer time. +- High Availability: S3 provides redundancy beyond what is available with a local file system (valuable for preservation, potentially reducing the need to perform data integrity checks). + +Challenges: + +- One additional step that is required to enable direct uploads via a Dataverse installation and for direct download to work with previewers and direct upload to work with DVWebloader (:ref:`folder-upload`) is to allow cross site (CORS) requests on your S3 store. +- Cost: S3 offers a pricing model that allows you to pay for the storage and transfer of data based on current usage (versus long term demand) but commercial + providers charge more per TB than the equivalent cost of a local disk (though commercial S3 storage is cheaper than commercial file storage). + There can also be egress and other charges. Overall, S3 storage is generally more expensive than local file storage but cheaper than cloud file storage. + Running a local S3 storage or leveraging an institutional service can further reduce costs. 
+- Direct upload via S3 is a :doc:`multi-step process `: Dataverse provides URLs for the uploads, the user's browser or other app uses the URLs to transfer files to the S3 store, + possibly in many pieces per file, and finally, Dataverse is told that one or more files are in place and should be added to the dataset. If the last step fails, or if + all parts of a file cannot be transferred, orphaned files or parts of files can be left on S3. These files are not accessible via Dataverse but do use space (for which there is a monetary cost) + until they are deleted. There is currently no automated clean-up mechanism. + +Other Considerations +^^^^^^^^^^^^^^^^^^^^ + +- S3 Storage without direct upload/download provides minimal benefits with Dataverse as files still pass through the server, files are still uploaded as a single HTTP/HTTPS stream, and temporary storage is still used. +- While not having files unzipped can be confusing to users who are used to it from using Dataverse with file storage, there are ways to minimize the impact. + For example, Dataverse can be configured to use a "Zip File Previewer" that allows users to see the contents of a zip file and even download individual files from within it (see :ref:`compressed-files`). + For users who still want their data stored as individual files with their relative folder paths, Dataverse can be configured with ":ref:`DVWebloader `" which allows users to select an entire folder tree of files and + upload them, with their relative paths intact, to Dataverse. (DVWebloader can only be used with S3/direct upload, but it is much more efficient with many files than using the + standard upload interface in Dataverse (which also does not retain path information)). +- Several features that involve Dataverse accessing files' contents, including unzipping zip files, are disabled when S3 direct upload is enabled. See :ref:`s3-direct-upload-features-disabled`. + +- Using direct upload stops Dataverse from inspecting the file bytes to determine the MIME type (with one exception - Stata files). Dataverse will still look at the file name and extension to determine the MIME type. +- To perform "ingest" processing (see :doc:`/user/tabulardataingest/index`), Dataverse currently has to copy the file to local storage, negating the benefit of sending data directly to S3. To manage larger files, one can set a per-store + ingest size limit (which can be 0 bytes) to stop ingest or limit it to smaller files (see :ref:`list-of-s3-storage-options`). +- Dataverse's mechanism for downloading a whole dataset or multiple selected files involves zipping those files together. Even When using S3 with direct upload/download, + the file bytes are transferred to the Dataverse server as part of the zipping process. There are ways to reduce the performance impact of this: + + - There is a :ref:`Standalone "Zipper" Service Tool ` that can be run separate from Dataverse to handle the zipping process. + - Dataverse has a :ref:`:ZipDownloadLimit` that can be used to limit the amount of data that can be zipped. If a dataset is larger than this limit, Dataverse will only add some of the files to the zip and list others in the included manifest file. + - There are tools such as the Dataverse Dataset Downloader (https://github.com/gdcc/dataverse-recipes/tree/main/shell/download#dataverse-dataset-downloader) that can be used to download all of the files individually. This avoids sending any of the files through the Dataverse server when S3 direct download is enabled. 
+ +- Dataverse leverages S3 features that are not implemented by all servers and has several configuration options geared towards handling variations between servers - see :ref:`s3-compatible`. Site admins should be sure to test with their preferred S3 implementation (and consider adding to the list of working S3 implementations). +- The part-size used when directly transferring files to S3 is configurable (at AWS, from 5 MiB to 5GiB). The default in Dataverse is 1 GiB (1024^3 bytes). If the primary use case is with smaller files than that, decreasing the part size may improve upload speeds. + +.. _remote-stores: + +Remote Stores +~~~~~~~~~~~~~ + +Note: Remote Storage is still experimental: feedback is welcome! See :ref:`support`. + +For very large, and/or very sensitive data, it may not make sense to transfer or copy files to Dataverse at all. +The ``remote`` store type in the Dataverse software supports these use cases. +It allows Dataverse to store a URL reference to the file rather than transferring the file bytes to a store managed directly by Dataverse. +In the most basic configuration a site administrator configures the base URL for the store, e.g. "https://thirdpartystorage.edu/long-term-storage/" +and users can then create files referencing any URL starting with that base, e.g. "https://thirdpartystorage.edu/long-term-storage/my_project_dir/my_file.txt". + +If the remote site is a public web server, the remote store in Dataverse should be configured to be "public" which will disable the ability to restrict +or embargo files (as they are public on the remote site and Dataverse cannot block access.) Conversely, Dataverse can be configured to sign requests to the +remote server which the remote server can then, if it is capable of validating them, use to reject requests not approved by Dataverse. In this configuration, +users can restrict and embargo files and Dataverse and the remote server will cooperate to manage access control. Another alternative, with a more advanced +remote store, would be, instead of using URLs that directly enable download of the file, to use URLs that point to a landing page at the remote server that +may require the user to login and go through some approval process before being able to access the file. + +Dataverse considers remote storage to be read-only, or, in cases where the remote service does not provide a way for Dataverse to download the file bytes +(due to access control or because the URL refers to a landing page), inaccessible. Depending on whether Dataverse can access the bytes of the file, +functionality such as ingest and integrity checking may or may not be possible. If the file bytes are not accessible, the remote store in Dataverse should be +configured to disable operations that attempt to access the file (see the files-not-accessible-by-dataverse in :ref:`trusted-remote-storage`). +Regardless of whether the remote files can be read, local storage of other datafiles and auxiliary files in the same dataset is possible. Support for such files is handled by configuring a "base" store with the remote store that is used for these purposes. (This means that while +files added as remote remain on the remote store, other files in the dataset, and potentially thumbnails and the ingested TSV format of remote files would be managed by Dataverse +in the base store. If ingest is not desired, the ingest size limit for the store can be set to 0 bytes). 
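As a rough sketch only (the store id ``remotedemo`` and its label are placeholders, the base URL reuses the example above, and the exact option names should be checked against :ref:`trusted-remote-storage`), a remote store plus its base store could be defined with JVM options along these lines:

.. code-block:: shell

    # Hypothetical example - adjust the store id, label, URL, and base store to your installation
    ./asadmin create-jvm-options "-Ddataverse.files.remotedemo.type=remote"
    ./asadmin create-jvm-options "-Ddataverse.files.remotedemo.label=ThirdPartyStorage"
    ./asadmin create-jvm-options "-Ddataverse.files.remotedemo.base-url=https\://thirdpartystorage.edu/long-term-storage"
    ./asadmin create-jvm-options "-Ddataverse.files.remotedemo.base-store=file"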
+ + +Benefits: + +- This is a relatively simple way to off-load the management of large and/or sensitive data files to other organizations while still providing Dataverse's overall capabilities for dataset curation and publication to users. +- If the store has been configured with a remote-store-name or remote-store-url, the dataset file table will include this information for remote files. These provide a visual indicator that the files are not managed directly by Dataverse and are stored/managed by a remote trusted store. + +Challenges: + +- As Dataverse is relying on the remote service to maintain the integrity and availability of the files, it is likely that the Dataverse site admin will want to have a formal agreement with the remote service + operator about their policies. +- Currently, remote files can only be added via the API. (This may be addressed in future versions). +- Remote files can only be added after the dataset is created in the UI (and therefore has an id and PID for use with the API). However, the UI will still allow upload of files to the base store (at dataset creation and when editing), which could be confusing. +- Site admins need to consider carefully how to configure file size limits, ingest size limits, etc. on the remote store and its base store, and whether the remote store is public-only, and whether files there can be read by Dataverse to assure the + requirements of a specific use case(s) are addressed. +- The current remote store implementation will not prevent you from providing a relative URL that results in a 404 when resolved (i.e. if you make a typo). You should check to make sure the file exists at the location you specify - by trying to download in Dataverse, by checking to see that Dataverse was able to get the file size (which it does with a HEAD call to that location), or just manually trying the URL in your browser. +- For large files, direct-download should always be used with a remote store. (Otherwise the Dataverse will be involved in the download.) +- When multiple files are selected for download, Dataverse will try to include remote files in the zip file being created (up to the max zip size limit) which is inefficient, and will not be able to include remote files that are inaccessible (possibly confusing). + +.. _globus-stores: + +Globus Stores: Globus Transfer Between Large File/Tape Archives +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Note: Globus Transfer is still experimental: feedback is welcome! See :ref:`support`. + +`Globus `_ provides file transfer service that is widely used for the world's largest datasets (in terms of both file size and number of files). It provides: + +- Robust file transfer capable of handling delays (e.g. due to the time it takes to mount tapes) and restarting after network or endpoint failures +- Rapid parallel file transfers, potentially between clusters of computers on both ends +- Third-party transfer, which enables a user working with their desktop browser to initiate transfers of files between remote endpoints, i.e. sending files on a local high-performance computing cluster to a Dataverse endpoint or vice versa. 
+
+Dataverse can be configured to support Globus transfers in multiple ways:
+
+- A Dataverse-managed Globus File Endpoint: Dataverse controls user access to the endpoint, access is only via Globus
+- A Dataverse-managed Globus S3 Endpoint: Dataverse controls user access to the endpoint, access is available via S3 and via Globus
+- A Globus Endpoint treated as Remote Storage: Dataverse references files on a Globus endpoint managed by a third party
+
+Benefits:
+
+- Globus scales to higher data volumes than any other option. Users working with large data are often familiar with Globus and are interested in transferring data to/from computational clusters rather than their local machine.
+- Globus transfers can be initiated by choosing the Globus option in the dataset upload panel. Analogously, "Globus Transfer" is one of the download options in the "Access Dataset" menu.
+- For the non-S3 options, Dataverse supports having a base store (e.g. a local file system or an S3-based store), which can be used internally by Dataverse (e.g. for thumbnails) and can allow users to upload smaller files (e.g. READMEs, documentation) that might not be suited to a given Globus endpoint (e.g. a tape store).
+
+Challenges:
+
+- Globus is complex to manage and Dataverse installations will need to develop Globus expertise or partner with another organization (e.g. an institutional high-performance computing center) to manage Globus endpoints.
+- For users not familiar with Globus, managing transfers can be confusing. For the non-S3 options, users cannot just download files - they must have access to a destination Globus endpoint and have a Globus account. Globus does provide free accounts and a free "Globus Connect Personal" service which can be installed on any machine to allow transfers to/from it.
+- Globus transfers are not enabled at dataset-creation time. Once the draft version is created, users can initiate Globus transfers to upload files from remote endpoints.
+- For Dataverse-managed endpoints, a community-developed `dataverse-globus `_ app must be installed and configured in the Dataverse instance.
+  This app manages granting and revoking access for users to upload/download files from Dataverse and handles the translation between Dataverse's internal file naming/organization and that seen by the user.
+- Users familiar with Globus sometimes expect to be able to find the Dataverse endpoint in Globus' online service and download files from there. Because Dataverse is managing permissions and handling file naming, this doesn't work.
+- Due to differences between Dataverse's and Globus's access control models, Dataverse cannot enforce per-file access restrictions - restriction can only be done today at the level of providing access to all files in a dataset.
+  Globus stores can be defined as public to disable Dataverse's ability to restrict and embargo files in that store. If the store is configured to support restriction and embargo,
+  Dataverse and its Dataverse-Globus app will limit users to downloading only the files they have been granted access to, but a technically knowledgeable user could access other files in the same dataset if they are given access to one.
+  (Data depositors would need to be aware of this limitation and could be guided to restrict all files/only grant access to all dataset files in Globus as a work-around.)
+- Dataverse-managed endpoints must be Globus "guest collections" hosted on either a file-system-based endpoint or an S3-based endpoint (the latter requires use of the Globus
+  S3 connector which requires a paid Globus subscription at the host institution). In either case, Dataverse is configured with the Globus credentials of a user account that can manage the endpoint.
+  Users will need their own Globus account, which can be obtained via their institution or directly from Globus (at no cost).
+- With the file-system endpoint, Dataverse does not currently have access to the file contents. Thus, functionality related to ingest, previews, fixity hash validation, etc. is not available.
+  (Using the S3-based endpoint, Dataverse has access via S3 and all functionality normally associated with direct uploads to S3 is available. In this case, admins should be sure to set the maximum size for ingest and avoid requiring hash validation at publication, etc.)
+- For the reference use case, Dataverse must be configured with a list of allowed endpoint/base paths from which files may be referenced. In this case, since Dataverse is not accessing the remote endpoint itself, it does not need Globus credentials.
+  Users will also need a Globus account in this case, and the remote endpoint must be configured to allow them access, i.e. be publicly readable, or potentially support some out-of-band mechanism for access requests (which could be described, for example, in the dataset's Terms of Use and Access).
+- As with remote stores, files can only be added in the Globus reference case via the Dataverse API.
+- While Globus itself can handle many (millions of) files of any size, Dataverse cannot handle more than thousands of files per dataset (at best) and some Globus endpoints may have limits on file sizes - both maximums and minimums (e.g. for tape storage where small files are inefficient).
+  Users will need to be made aware of these limitations and the possibilities for managing them (e.g. by aggregating multiple files in a single, larger file, or storing smaller files in the base store via the normal Dataverse upload UI).
+- There is currently `a bug `_ that prevents users from transferring files from/to endpoints where they do not have permission to list the overall file tree (e.g. an institution manages /institution_name but the user only has access to /institution_name/my_dir).
+  Until that is fixed, a work-around is to first transfer data to an endpoint without this restriction.
+- An alternative, experimental implementation of Globus polling of ongoing upload transfers was added in v6.4. This framework does not rely on the instance staying up continuously for the duration of the transfer and saves the state information about Globus upload requests in the database. While it is now the recommended option, it is not enabled by default. See the ``globus-use-experimental-async-framework`` feature flag (see :ref:`feature-flags`) and the JVM option :ref:`dataverse.files.globus-monitoring-server`. A minimal configuration sketch is included at the end of this section.
+
+More details of the setup required to enable Globus are described in the `Community Dataverse-Globus Setup and Configuration document `_ and the references therein.
+
+An overview of the control and data transfer interactions between components was presented at the 2022 Dataverse Community Meeting and can be viewed in the `Integrations and Tools Session Video `_ around the 1 hr 28 min mark.
+
+See also :ref:`globus-support` and :ref:`Globus settings <:GlobusSettings>`.
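+
+As a minimal sketch of the experimental asynchronous monitoring mentioned above (the flag and option names are taken from the list above, but the value shown for ``dataverse.files.globus-monitoring-server`` is an assumption - check :ref:`feature-flags` and the JVM option reference for the authoritative semantics):
+
+.. code-block:: bash
+
+  # Enable the experimental async Globus upload monitoring feature flag (feature flags use the dataverse.feature.* prefix)
+  ./asadmin create-jvm-options "-Ddataverse.feature.globus-use-experimental-async-framework=true"
+  # Assumed true/false value designating this server as the one that polls ongoing Globus transfers
+  ./asadmin create-jvm-options "-Ddataverse.files.globus-monitoring-server=true"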
+ + +Storage Strategy Recommendations +-------------------------------- + +Based on both file size and volume considerations, here are some general recommendations: + +1. **For research projects with moderate data (< 2GB files, < 100s of files/dataset):** + + * The default File Store is sufficient + * Consider setting file count and size limits (see below) + +2. **For projects with larger files (GBs to TBs) and/or more files per dataset (100s to 1000s):** + + * Configure an S3 store with direct upload/download + * Set appropriate ingest size limits + +3. **For projects with very large files or sensitive data that should remain in place:** + + * Use the Remote Store + +4. **For high-performance computing environments or very large datasets (TBs+):** + + * Use a Globus Store + * Work with users to size files appropriate to the underlying storage + * Consider Globus over S3 when normal upload/download options (via the UI/API) are desired along with Globus transfer + +5. **For Petascale datasets, or extreme numbers of files:** + + * Consider a Remote Store and referencing a single URL/Globus endpoint for the entire dataset + +6. **For Dataverse installations supporting a range of data scales:** + + * Consider using :ref:`multiple stores ` and assigning stores to individual collections or datasets + +Managing More Files per Dataset and More Datasets +------------------------------------------------- + +Dataverse can be configured to handle datasets with hundreds or thousands of files and hundreds of thousands of datasets. However, reaching these levels can require significant effort to appropriately configure the server. + +Technically, there are two factors that can limit scaling the number of files per dataset: how much the Dataverse server is involved in data transfer, and constraints based on Dataverse's code and database configuration. +The former is dramatically affected by the choice for file storage and options such as the S3 direct upload/download settings and ingest size limits. There are fewer ways to affect the latter beyond increasing the amount of memory and CPU resources available +or rewriting the relevant parts of Dataverse. (There are continuing efforts to improve Dataverse's performance and scaling, so it is also advisable to use the latest version if you are pushing the boundaries on scaling. Progress is being made.) + +Scaling to larger numbers of datasets (and to some extent scaling files per dataset) also depends on Dataverse's Solr search engine. There have been very significant improvements in indexing and search performance in recent releases, including some that are not turned on by default (listed below). + + + +Avoiding Many Files +~~~~~~~~~~~~~~~~~~~ + +Before describing things that can be done to improve scaling, it is important to note that there are configuration options and best practices to suggest to users to help avoid larger datasets and help them avoid hitting performance issues by going beyond the file counts you know work in your instance. + +There are a number of settings to limit how many files can be uploaded (see :ref:`database-settings` and :ref:`jvm-options` for more details): + +- :ref:`:ZipUploadFilesLimit` - the maximum number of files allowed in an uploaded zip file - only relevant for file stores and S3 when direct upload is not used. 
+- :ref:`:MultipleUploadFilesLimit` - the number of files the GUI user is allowed to upload in one batch, via drag-and-drop, or through the file select dialog +- :ref:`:MaxFileUploadSizeInBytes` - limit the size of files that can be uploaded +- :ref:`:UseStorageQuotas` - once enabled, super users can set per-collection quotas (in bytes) to limit the aggregate size of all files in the collection +- :ref:`dataverse.files.default-dataset-file-count-limit` - directly limits the number of files per dataset, can be changed per dataset via API (by super users) + +Scaling-related Configuration +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +There are a broad range of options (that are not turned on by default) for improving how well Solr indexing and searching scales and for handling more files per dataset. Some of these are useful for all installations while others are related to specific use cases, or are mostly for emergency use (e.g. disabling facets). +(see :ref:`database-settings`, :ref:`jvm-options`, and :ref:`feature-flags` for more details): + +- dataverse.feature.add-publicobject-solr-field=true - specifically marks unrestricted content as public in Solr. See :ref:`feature-flags`. +- dataverse.feature.avoid-expensive-solr-join=true - this tells Dataverse to use the feature above to speed up searches. See :ref:`feature-flags`. +- dataverse.feature.reduce-solr-deletes=true - when Solr entries are being updated, this avoids an unnecessary step (deletion of existing entries) for entries that are being replaced. See :ref:`feature-flags`. +- dataverse.feature.disable-dataset-thumbnail-autoselect=true - by default, Dataverse scans through all files in a dataset to find one that can be used as a thumbnail, which is expensive for many files. This disables that behavior to improve performance. See :ref:`feature-flags`. +- dataverse.feature.only-update-datacite-when-needed=true - reduces the load on DataCite and reduces Dataverse failures related to that load, which is important when using file PIDs on Datasets with many files. See :ref:`feature-flags`. +- :ref:`dataverse.solr.min-files-to-use-proxy` = - improve performance/lower memory requirements when indexing datasets with many files, suggested value is in the range 200 to 500 +- :ref:`dataverse.solr.concurrency.max-async-indexes` = - limits the number of index operations running in parallel. The default is 4, larger values may improve performance (if the Solr instance is appropriately sized) +- :ref:`:SolrFullTextIndexing` - false improves performance at the expense of not indexing file contents +- :ref:`:SolrMaxFileSizeForFullTextIndexing` - size in bytes (default unset/no limit) above which file contents should not be indexed +- :ref:`:ZipDownloadLimit` - the maximum size in bytes for zipped downloads of files from a dataset. If the size of requested files is larger, some files will be omitted and listed in the zip manifest file as not included. +- :ref:`:DatasetChecksumValidationSizeLimit` - by default, Dataverse checks fixity (assuring the file contents match the recorded checksum) as part of publication. This setting specifies a maximum aggregate dataset size, above which this validation will not be done. +- :ref:`:DataFileChecksumValidationSizeLimit` - by default, Dataverse checks fixity (assuring the file contents match the recorded checksum) as part of publication. This setting specifies a maximum file size, above which validation will not be done. +- :ref:`:FilePIDsEnabled` - false is recommended when datasets have many files. 
Related settings allow file PIDS to be enabled/disabled per collection and per dataset +- :ref:`:CustomZipDownloadServiceUrl` - allows use of a separate process/machine to handle zipping up multi-file downloads. Requires installation of the separate Zip Download app +- :ref:`:WebloaderUrl` - enables use of an installed DVWebloader (by specifying its web location) which is more efficient for uploading many files +- :ref:`:CategoryOrder` - Pre-sorts the file display by category, e.g. showing all "Documentation" files before "Data" files. Any user selected sorting by name, age, or size is done within these sections +- :ref:`:OrderByFolder` - pre-sorts files by their directory Label (folder), showing files with no path before others. Any user selected sorting by name, age, or size is done within these sections +- :ref:`:DisableSolrFacets` - disables facets, which are costly to generate, in search results (including the main collection page) +- :ref:`:DisableSolrFacetsForGuestUsers` - only disable facets for guests +- :ref:`:DisableSolrFacetsWithoutJsession` - disables facets for users who have disabled cookies (e.g. for bots) +- :ref:`:DisableUncheckedTypesFacet` - only disables the facet showing the number of collections, datasets, files matching the query (this facet is potentially less useful than others) +- :ref:`:StoreIngestedTabularFilesWithVarHeaders` - by default, Dataverse stores ingested files without headers and dynamically adds them back at download time. Once this setting is enabled, Dataverse will leave the headers in place (for newly ingested files), reducing the cost of downloads + + +Scaling Infrastructure +---------------------- + +There is no well-defined cut-off in terms of files per dataset or number of datasets where the Dataverse software will fail. In general the speed of viewing and editing a large dataset will decrease as the volume of datasets and files increases. +For a given installation, at some point, Dataverse will need more memory than is available, or will max out the CPU or other resources and performance may decline dramatically. + +In such cases: + +- Consider increasing the memory available to Dataverse (the Java heap size for the Payara instance) +- Consider a larger machine (more CPU resources) +- Verify that performance isn't being limited by Solr or Postgres +- Investigate performance tuning options for Payara, Solr, and Postgres +- Coordinate with others in the community - there is a lot of aggregate knowledge +- Consider contributing to software design changes - Dataverse scaling has improved dramatically over the past several years, but more can be done +- Watch for the new single page application (SPA) front-end for Dataverse. It includes features such as infinite scrolling through files with much faster initial page load times diff --git a/doc/sphinx-guides/source/admin/dataverses-datasets.rst b/doc/sphinx-guides/source/admin/dataverses-datasets.rst index a37819c90e1..c916b79aaa8 100644 --- a/doc/sphinx-guides/source/admin/dataverses-datasets.rst +++ b/doc/sphinx-guides/source/admin/dataverses-datasets.rst @@ -56,13 +56,19 @@ To direct new files (uploaded when datasets are created or edited) for all datas (Note that for ``dataverse.files.store1.label=MyLabel``, you should pass ``MyLabel``.) 
-The current driver can be seen using::
+A store assigned directly to a collection can be seen using::
 
   curl -H "X-Dataverse-key: $API_TOKEN" http://$SERVER/api/admin/dataverse/$dataverse-alias/storageDriver
 
-(Note that for ``dataverse.files.store1.label=MyLabel``, ``store1`` will be returned.)
+This may be null. To get the effective storageDriver for a collection, which may be inherited from a parent collection or be the installation default, you can use::
 
-and can be reset to the default store with::
+  curl -H "X-Dataverse-key: $API_TOKEN" http://$SERVER/api/admin/dataverse/$dataverse-alias/storageDriver?getEffective=true
+
+This will never be null.
+
+(Note that for ``dataverse.files.store1.label=MyLabel``, the JSON response will include "name":"store1" and "label":"MyLabel".)
+
+To delete a store assigned directly to a collection (so that the collection's effective store is inherited from its parent or is the global default), use::
 
   curl -H "X-Dataverse-key: $API_TOKEN" -X DELETE http://$SERVER/api/admin/dataverse/$dataverse-alias/storageDriver
 
@@ -257,15 +263,17 @@ To identify invalid data values in specific datasets (if, for example, an attemp
 Configure a Dataset to Store All New Files in a Specific File Store
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Configure a dataset to use a specific file store (this API can only be used by a superuser) ::
+Configure an individual dataset to use a specific file store (this API can only be used by a superuser) ::
 
   curl -H "X-Dataverse-key: $API_TOKEN" -X PUT -d $storageDriverLabel http://$SERVER/api/datasets/$dataset-id/storageDriver
 
-The current driver can be seen using::
+The effective store can be seen using::
 
   curl http://$SERVER/api/datasets/$dataset-id/storageDriver
 
-It can be reset to the default store as follows (only a superuser can do this) ::
+The output of the API will include the id, label, type (for example, "file" or "s3") as well as the support for direct download and upload.
+
+To remove an assigned store, and allow the dataset to inherit the store from its parent collection, use the following (only a superuser can do this) ::
 
   curl -H "X-Dataverse-key: $API_TOKEN" -X DELETE http://$SERVER/api/datasets/$dataset-id/storageDriver
 
diff --git a/doc/sphinx-guides/source/admin/index.rst b/doc/sphinx-guides/source/admin/index.rst
index a8a543571a7..4d2d5c22fc2 100755
--- a/doc/sphinx-guides/source/admin/index.rst
+++ b/doc/sphinx-guides/source/admin/index.rst
@@ -35,3 +35,4 @@ This guide documents the functionality only available to superusers (such as "da
    maintenance
    backups
    troubleshooting
+   big-data-administration
diff --git a/doc/sphinx-guides/source/admin/integrations.rst b/doc/sphinx-guides/source/admin/integrations.rst
index 8c627609af2..bb981c75ace 100644
--- a/doc/sphinx-guides/source/admin/integrations.rst
+++ b/doc/sphinx-guides/source/admin/integrations.rst
@@ -240,11 +240,6 @@ Discoverability
 
 A number of builtin features related to data discovery are listed under :doc:`discoverability` but you can further increase the discoverability of your data by setting up integrations.
 
-SHARE
-+++++
-
-`SHARE `_ is building a free, open, data set about research and scholarly activities across their life cycle. It's possible to add a Dataverse installation as one of the `sources `_ they include if you contact the SHARE team.
- Geodisy +++++++ diff --git a/doc/sphinx-guides/source/admin/make-data-count.rst b/doc/sphinx-guides/source/admin/make-data-count.rst index 0103a6f9e38..f8ffa7bb084 100644 --- a/doc/sphinx-guides/source/admin/make-data-count.rst +++ b/doc/sphinx-guides/source/admin/make-data-count.rst @@ -166,6 +166,8 @@ The example :download:`counter_weekly.sh <../_static/util/counter_weekly.sh>` wi Citations will be retrieved for each published dataset and recorded in the your Dataverse installation's database. +Note that the :ref:`dataverse.api.mdc.min-delay-ms` setting can be used to avoid getting rate-limit errors from DataCite. + For how to get the citations out of your Dataverse installation, see "Retrieving Citations for a Dataset" under :ref:`Dataset Metrics ` in the :doc:`/api/native-api` section of the API Guide. Please note that while the Dataverse Software has a metadata field for "Related Dataset" this information is not currently sent as a citation to Crossref. diff --git a/doc/sphinx-guides/source/admin/monitoring.rst b/doc/sphinx-guides/source/admin/monitoring.rst index 16bb18b7ad2..ef4e4f4f206 100644 --- a/doc/sphinx-guides/source/admin/monitoring.rst +++ b/doc/sphinx-guides/source/admin/monitoring.rst @@ -149,7 +149,7 @@ Tips: - Use **Enhanced Monitoring**. Enhanced Monitoring gathers its metrics from an agent on the instance. See `Enhanced Monitoring docs `_. - It's possible to view and act on **RDS Events** such as snapshots, parameter changes, etc. See `Working with Amazon RDS events `_ for details. - RDS monitoring is available via API and the ``aws`` command line tool. For example, see `Retrieving metrics with the Performance Insights API `_. -- To play with monitoring RDS using a server configured by `dataverse-ansible `_ set ``use_rds`` to true to skip some steps that aren't necessary when using RDS. See also the :doc:`/developers/deployment` section of the Developer Guide. +- To play with monitoring RDS using a server configured by `dataverse-ansible `_ set ``use_rds`` to true to skip some steps that aren't necessary when using RDS. See also the :doc:`/developers/deployment` section of the Developer Guide. MicroProfile Metrics endpoint ----------------------------- diff --git a/doc/sphinx-guides/source/api/changelog.rst b/doc/sphinx-guides/source/api/changelog.rst index 5be6c78adce..4c7a5914b1e 100644 --- a/doc/sphinx-guides/source/api/changelog.rst +++ b/doc/sphinx-guides/source/api/changelog.rst @@ -7,12 +7,22 @@ This API changelog is experimental and we would love feedback on its usefulness. :local: :depth: 1 +v6.9 +---- + +- The POST /api/admin/makeDataCount/{id}/updateCitationsForDataset processing is now asynchronous and the response no longer includes the number of citations. The response can be OK if the request is queued or 503 if the queue is full (default queue size is 1000). +- The way to set per-format size limits for tabular ingest has changed. JSON input is now used. See :ref:`:TabularIngestSizeLimit`. +- In the past, the settings API would accept any key and value. This is no longer the case because validation has been added. See :ref:`settings_put_single`, for example. +- For GET /api/notifications/all the JSON response has changed breaking the backward compatibility of the API. +- For GET /api/admin/dataverse/{dataverse-alias}/storageDriver and /api/datasets/{identifier}/storageDriver the driver name is no longer returned in data.message. Instead, it is returned as data.name (along with other information about the storageDriver). 
+ v6.8 ---- - For POST /api/files/{id}/metadata passing an empty string ("description":"") or array ("categories":[]) will no longer be ignored. Empty fields will now clear out the values in the file's metadata. To ignore the fields simply do not include them in the JSON string. - For PUT /api/datasets/{id}/editMetadata the query parameter "sourceInternalVersionNumber" has been removed and replaced with "sourceLastUpdateTime" to verify that the data being edited hasn't been modified and isn't stale. -- For GET /api/dataverses/$dataverse-alias/links the Json response has changed breaking the backward compatibility of the API. +- For GET /api/dataverses/$dataverse-alias/links the JSON response has changed breaking the backward compatibility of the API. +- For PUT /api/dataverses/$dataverse-alias/inputLevels custom input levels that had been previously set will no longer be deleted. To delete input levels send an empty list (deletes all), then send the new/modified list. - For GET /api/externalTools and /api/externalTools/{id} the responses are now formatted as JSON (previously the toolParameters and allowedApiCalls were a JSON object and array (respectively) that were serialized as JSON strings) and any configured "requirements" are included. v6.7 diff --git a/doc/sphinx-guides/source/api/external-tools.rst b/doc/sphinx-guides/source/api/external-tools.rst index ae0e44b36aa..57a98a0c7c2 100644 --- a/doc/sphinx-guides/source/api/external-tools.rst +++ b/doc/sphinx-guides/source/api/external-tools.rst @@ -11,6 +11,9 @@ Introduction External tools are additional applications the user can access or open from your Dataverse installation to preview, explore, and manipulate data files and datasets. The term "external" is used to indicate that the tool is not part of the main Dataverse Software. +.. note:: + Browser-based tools must have CORS explicitly enabled via :ref:`dataverse.cors.origin `. List every origin that will host your tool (or use ``*`` when a wildcard is acceptable). If an origin is not listed, the browser will block that tool's API requests even if the tool page itself loads. + Once you have created the external tool itself (which is most of the work!), you need to teach a Dataverse installation how to construct URLs that your tool needs to operate. For example, if you've deployed your tool to fabulousfiletool.com your tool might want the ID of a file and the siteUrl of the Dataverse installation like this: https://fabulousfiletool.com?fileId=42&siteUrl=https://demo.dataverse.org In short, you will be creating a manifest in JSON format that describes not only how to construct URLs for your tool, but also what types of files your tool operates on, where it should appear in the Dataverse installation web interfaces, etc. @@ -202,7 +205,7 @@ Testing Your External Tool As the author of an external tool, you are not expected to learn how to install and operate a Dataverse installation. There's a very good chance your tool can be added to a server Dataverse Community developers use for testing if you reach out on any of the channels listed under :ref:`getting-help-developers` in the Developer Guide. -By all means, if you'd like to install a Dataverse installation yourself, a number of developer-centric options are available. For example, there's a script to spin up a Dataverse installation on EC2 at https://github.com/GlobalDataverseCommunityConsortium/dataverse-ansible . 
The process for using curl to add your external tool to your Dataverse installation is documented under :ref:`managing-external-tools` in the Admin Guide. +By all means, if you'd like to install a Dataverse installation yourself, a number of developer-centric options are available. For example, there's a script to spin up a Dataverse installation on EC2 at https://github.com/gdcc/dataverse-ansible . The process for using curl to add your external tool to your Dataverse installation is documented under :ref:`managing-external-tools` in the Admin Guide. Spreading the Word About Your External Tool ------------------------------------------- @@ -219,7 +222,7 @@ If you've thought to yourself that there ought to be an app store for Dataverse Demoing Your External Tool ++++++++++++++++++++++++++ -https://demo.dataverse.org is the place to play around with the Dataverse Software and your tool can be included. Please email support@dataverse.org to start the conversation about adding your tool. Additionally, you are welcome to open an issue at https://github.com/GlobalDataverseCommunityConsortium/dataverse-ansible which already includes a number of the tools listed above. +https://demo.dataverse.org is the place to play around with the Dataverse Software and your tool can be included. Please email support@dataverse.org to start the conversation about adding your tool. Additionally, you are welcome to open an issue at https://github.com/gdcc/dataverse-ansible which already includes a number of the tools listed above. Announcing Your External Tool +++++++++++++++++++++++++++++ diff --git a/doc/sphinx-guides/source/api/linkeddatanotification.rst b/doc/sphinx-guides/source/api/linkeddatanotification.rst index d55dc4da084..f3278196093 100644 --- a/doc/sphinx-guides/source/api/linkeddatanotification.rst +++ b/doc/sphinx-guides/source/api/linkeddatanotification.rst @@ -1,12 +1,17 @@ Linked Data Notification API ============================ -Dataverse has a limited, experimental API implementing a Linked Data Notification inbox allowing it to receive messages indicating a link between an external resource and a Dataverse dataset. +Dataverse has an API implementing a Linked Data Notification (LDN) inbox allowing it to receive messages implementing the `COAR Notify Relationship Announcement `_ indicating a link between an external resource and a Dataverse dataset. + +Dataverse has a related capability to send COAR Notify Relationship Announcement messages, automatically upon publication or manually. See the :doc:`/developers/workflows` section of the Guides. + The motivating use case is to support a use case where Dataverse administrators may wish to create back-links to the remote resource (e.g. as a Related Publication, Related Material, etc.). -Upon receipt of a relevant message, Dataverse will create Announcement Received notifications for superusers, who can edit the dataset involved. (In the motivating use case, these users may then add an appropriate relationship and use the Update Curent Version publishing option to add it to the most recently published version of the dataset.) +Upon receipt of a relevant message, Dataverse will create Announcement Received notifications for users who can edit the dataset involved. Notifications can be restricted to superusers who can publish the dataset as described below. 
(In the motivating use case, these superusers may then add an appropriate relationship and use the Update Curent Version publishing option to add it to the most recently published version of the dataset.) + +The ``dataverse.ldn.allowed-hosts`` JVM option is a comma-separated list of hosts from which Dataverse will accept and process messages. By default, no hosts are allowed. ``*`` can be used in testing to indicate all hosts are allowed. -The ``:LDNMessageHosts`` setting is a comma-separated whitelist of hosts from which Dataverse will accept and process messages. By default, no hosts are allowed. ``*`` can be used in testing to indicate all hosts are allowed. +The ``dataverse.ldn.coar-notify.relationship-announcement.notify-superusers-only`` JVM option can be set to ``true`` to restrict notifications to superusers only (those who can publish the dataset). The default is to notify all users who can publish the dataset. Messages can be sent via POST, using the application/ld+json ContentType: @@ -15,10 +20,12 @@ Messages can be sent via POST, using the application/ld+json ContentType: export SERVER_URL=https://demo.dataverse.org curl -X POST -H 'ContentType:application/ld+json' $SERVER_URL/api/inbox --upload-file message.jsonld + -The supported message format is described by `our preliminary specification `_. The format is expected to change in the near future to match the standard for relationship announcements being developed as part of `the COAR Notify Project `_. +The supported message format is described by `the COAR Notify Relationship Announcement specification `_. -An example message is shown below. It indicates that a resource with the name "An Interesting Title" exists and "IsSupplementedBy" the dataset with DOI https://doi.org/10.5072/FK2/GGCCDL. If this dataset is managed in the receiving Dataverse, a notification will be sent to user with the relevant permissions (as described above). +An example message is shown below. It indicates that a resource in the "Harvard DASH" test server has, as a "supplement", the dataset with DOI doi:10.5074/FKNOAHNQ. +If this dataset is managed in the receiving Dataverse, a notification will be sent to user with the relevant permissions (as described above). .. code:: json @@ -27,39 +34,44 @@ An example message is shown below. 
It indicates that a resource with the name "A "https://www.w3.org/ns/activitystreams", "https://purl.org/coar/notify" ], - "id": "urn:uuid:94ecae35-dcfd-4182-8550-22c7164fe23f", "actor": { - "id": "https://research-organisation.org/dspace", - "name": "DSpace Repository", + "id": "https://harvard-dash.staging.4science.cloud", + "name": "Harvard DASH", "type": "Service" }, "context": { - "IsSupplementedBy": - { - "id": "http://dev-hdc3b.lib.harvard.edu/dataset.xhtml?persistentId=doi:10.5072/FK2/GGCCDL", - "ietf:cite-as": "https://doi.org/10.5072/FK2/GGCCDL", - "type": "sorg:Dataset" - } + "id": "https://harvard-dash.staging.4science.cloud/handle/1/42718322", + "ietf:cite-as": "https://harvard-dash.staging.4science.cloud/handle/1/42718322", + "ietf:item": { + "id": "https://harvard-dash.staging.4science.cloud/bitstreams/e2ae80a1-35e5-411b-9ef1-9175f6cccf23/download", + "mediaType": "application/pdf", + "type": [ + "Article", + "sorg:ScholarlyArticle" + ] + }, + "type": "sorg:AboutPage" }, + "id": "urn:uuid:3c933c09-c246-473d-bea4-674db168cfee", "object": { - "id": "https://research-organisation.org/dspace/item/35759679-5df3-4633-b7e5-4cf24b4d0614", - "ietf:cite-as": "https://research-organisation.org/authority/resolve/35759679-5df3-4633-b7e5-4cf24b4d0614", - "sorg:name": "An Interesting Title", - "type": "sorg:ScholarlyArticle" + "as:object": "doi: 10.5074/FKNOAHNQ", + "as:relationship": "http://purl.org/vocab/frbr/core#supplement", + "as:subject": "https://harvard-dash.staging.4science.cloud/handle/1/42718322", + "id": "urn:uuid:0851f805-c52f-4d0b-81ac-a07e99c33e20", + "type": "Relationship" }, "origin": { - "id": "https://research-organisation.org/dspace", - "inbox": "https://research-organisation.org/dspace/inbox/", + "id": "https://harvard-dash.staging.4science.cloud", + "inbox": "https://harvard-dash.staging.4science.cloud/server/ldn/inbox", "type": "Service" }, "target": { - "id": "https://research-organisation.org/dataverse", - "inbox": "https://research-organisation.org/dataverse/inbox/", + "id": "http://ec2-3-238-245-253.compute-1.amazonaws.com/", + "inbox": "http://ec2-3-238-245-253.compute-1.amazonaws.com/api/inbox", "type": "Service" }, "type": [ "Announce", - "coar-notify:ReleaseAction" + "coar-notify:RelationshipAction" ] } - diff --git a/doc/sphinx-guides/source/api/native-api.rst b/doc/sphinx-guides/source/api/native-api.rst index fa4b4611559..6c1720e2b5b 100644 --- a/doc/sphinx-guides/source/api/native-api.rst +++ b/doc/sphinx-guides/source/api/native-api.rst @@ -287,6 +287,51 @@ The fully expanded example above (without environment variables) looks like this curl -H "X-Dataverse-key:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" "https://demo.dataverse.org/api/dataverses/root/roles" +List the Allowed Metadata Languages of a Dataverse Collection +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Shows the allowed metadata languages of the Dataverse collection ``id``: + +.. code-block:: bash + + export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + export SERVER_URL=https://demo.dataverse.org + export ID=root + + curl -H "X-Dataverse-key:$API_TOKEN" "$SERVER_URL/api/dataverses/$ID/allowedMetadataLanguages" + +The fully expanded example above (without environment variables) looks like this: + +.. code-block:: bash + + curl -H "X-Dataverse-key:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" "https://demo.dataverse.org/api/dataverses/root/allowedMetadataLanguages" + +If there are no metadata languages configured on the server, this call returns an empty array. 
If the Dataverse collection has a mandatory metadata language, the return value is an array of that single language,
+otherwise it's an array of all available metadata languages on the server.
+
+Set the Allowed Metadata Language of a Dataverse Collection
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Sets the allowed metadata language of the Dataverse collection ``id`` to ``langCode`` if it's available on the server:
+
+.. code-block:: bash
+
+  export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
+  export SERVER_URL=https://demo.dataverse.org
+  export ID=root
+  export LANGCODE=en
+
+  curl -H "X-Dataverse-key:$API_TOKEN" -X PUT "$SERVER_URL/api/dataverses/$ID/allowedMetadataLanguages/$LANGCODE"
+
+The fully expanded example above (without environment variables) looks like this:
+
+.. code-block:: bash
+
+  curl -H "X-Dataverse-key:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X PUT "https://demo.dataverse.org/api/dataverses/root/allowedMetadataLanguages/en"
+
+Returns an array of the set metadata language.
+If the metadata language is not available on the server, this call responds with a 400 BAD REQUEST.
+
 List Facets Configured for a Dataverse Collection
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -732,6 +777,50 @@ Note: you must have "Add Dataset" permission in the given collection to invoke t
 
 .. _featured-collections:
 
+List Dataverse Collections to Which a Given Dataset or Dataverse Collection May Be Linked
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The user may provide a search term to limit the list of Dataverse Collections returned. The search term will be compared to the name of the Dataverse Collections.
+The response is a JSON array of the ids, aliases, and names of the Dataverse collections to which a given Dataset or Dataverse Collection may be linked:
+
+For a given Dataverse Collection:
+
+.. code-block:: bash
+
+  export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
+  export SERVER_URL=https://demo.dataverse.org
+  export OBJECT_TYPE=dataverse
+  export ID=collectionAlias
+  export SEARCH_TERM=searchOn
+
+  curl -H "X-Dataverse-key:$API_TOKEN" -X GET "$SERVER_URL/api/dataverses/$ID/$OBJECT_TYPE/linkingDataverses?searchTerm=$SEARCH_TERM"
+
+The fully expanded example above (without environment variables) looks like this:
+
+.. code-block:: bash
+
+  curl -H "X-Dataverse-key:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X GET "https://demo.dataverse.org/api/dataverses/collectionAlias/dataverse/linkingDataverses?searchTerm=searchOn"
+
+For a given Dataset:
+
+.. code-block:: bash
+
+  export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
+  export SERVER_URL=https://demo.dataverse.org
+  export OBJECT_TYPE=dataset
+  export PERSISTENT_IDENTIFIER=doi:10.5072/FK2/J8SJZB
+  export SEARCH_TERM=searchOn
+
+  curl -H "X-Dataverse-key:$API_TOKEN" -X GET "$SERVER_URL/api/dataverses/:persistentId/$OBJECT_TYPE/linkingDataverses?searchTerm=$SEARCH_TERM&persistentId=$PERSISTENT_IDENTIFIER"
+
+The fully expanded example above (without environment variables) looks like this:
+
+.. code-block:: bash
+
+  curl -H "X-Dataverse-key:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X GET "https://demo.dataverse.org/api/dataverses/:persistentId/dataset/linkingDataverses?searchTerm=searchOn&persistentId=doi:10.5072/FK2/J8SJZB"
+
+You may also add an optional "alreadyLinked=true" parameter to return collections which are already linked to the given Dataset or Dataverse Collection.
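+
+For example, following the pattern of the calls above, the collections already linked to a given Dataverse Collection could be listed with (the environment variables are the same placeholders as above):
+
+.. code-block:: bash
+
+  curl -H "X-Dataverse-key:$API_TOKEN" -X GET "$SERVER_URL/api/dataverses/$ID/$OBJECT_TYPE/linkingDataverses?searchTerm=$SEARCH_TERM&alreadyLinked=true"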
+ List Featured Collections for a Dataverse Collection ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -1106,7 +1195,8 @@ Update Collection Input Levels Updates the dataset field type input levels in a collection. -Please note that this endpoint overwrites all the input levels of the collection page, so if you want to keep the existing ones, you will need to add them to the JSON request body. +Please note that this endpoint does not change previously updated input levels of the collection page, so if you want to add new levels or modify existing ones, you will need to include them in the JSON request body. +In order to delete input levels you must call this API with an empty list to delete all of the input levels, then call this API with the new list of input levels. If one of the input levels corresponds to a dataset field type belonging to a metadata block that does not exist in the collection, the metadata block will be added to the collection. @@ -1160,16 +1250,22 @@ Collection Storage Quotas curl -H "X-Dataverse-key:$API_TOKEN" "$SERVER_URL/api/dataverses/$ID/storage/quota" -Will output the storage quota allocated (in bytes), or a message indicating that the quota is not defined for the specific collection. The user identified by the API token must have the ``Manage`` permission on the collection. +Will output the storage quota allocated (in bytes), or a message indicating that the quota is not defined for the collection. If this is an unpublished collection, the user must have the ``ViewUnpublishedDataverse`` permission. +With an optional query parameter ``showInherited=true`` it will show the applicable quota potentially defined on the nearest parent when the collection does not have a quota configured directly. + +.. code-block:: + + curl -H "X-Dataverse-key:$API_TOKEN" "$SERVER_URL/api/dataverses/$ID/storage/use" +Will output the dynamically cached total storage size (in bytes) used by the collection. The user identified by the API token must have the ``Edit`` permission on the collection. To set or change the storage allocation quota for a collection: .. code-block:: - curl -X POST -H "X-Dataverse-key:$API_TOKEN" "$SERVER_URL/api/dataverses/$ID/storage/quota/$SIZE_IN_BYTES" + curl -X PUT -H "X-Dataverse-key:$API_TOKEN" -d $SIZE_IN_BYTES "$SERVER_URL/api/dataverses/$ID/storage/quota" -This is API is superuser-only. +This API is superuser-only. To delete a storage quota configured for a collection: @@ -1178,9 +1274,70 @@ To delete a storage quota configured for a collection: curl -X DELETE -H "X-Dataverse-key:$API_TOKEN" "$SERVER_URL/api/dataverses/$ID/storage/quota" -This is API is superuser-only. +This API is superuser-only. + +Storage Quotas on Individual Datasets +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: + + curl -H "X-Dataverse-key:$API_TOKEN" "$SERVER_URL/api/datasets/$ID/storage/quota" + +Will output the storage quota allocated (in bytes), or a message indicating that the quota is not defined for this dataset. If this is an unpublished dataset, the user must have the ``ViewUnpublishedDataset`` permission. +With an optional query parameter ``showInherited=true`` it will show the applicable quota potentially defined on the nearest parent collection when the dataset does not have a quota configured directly. + +.. code-block:: + + curl -H "X-Dataverse-key:$API_TOKEN" "$SERVER_URL/api/datasets/$ID/storage/use" + +Will output the dynamically cached total storage size (in bytes) used by the dataset. 
The user identified by the API token must have the ``Edit`` permission on the dataset. + +To set or change the storage allocation quota for a dataset: + +.. code-block:: + + curl -X PUT -H "X-Dataverse-key:$API_TOKEN" -d $SIZE_IN_BYTES "$SERVER_URL/api/datasets/$ID/storage/quota" + +This API is superuser-only. + + +To delete a storage quota configured for a dataset: + +.. code-block:: + + curl -X DELETE -H "X-Dataverse-key:$API_TOKEN" "$SERVER_URL/api/datasets/$ID/storage/quota" + +This API is superuser-only. + +The following convenience API shows the dynamic values of the *remaining* storage size and/or file number quotas on the dataset, if present. For example: + +.. code-block:: + + curl -H "X-Dataverse-key: $API_TOKEN" "http://localhost:8080/api/datasets/$dataset-id/uploadlimits" + { + "status": "OK", + "data": { + "uploadLimits": { + "numberOfFilesRemaining": 20, + "storageQuotaRemaining": 1048576 + } + } + } + +Or, when neither limit is present: -Use the ``/settings`` API to enable or disable the enforcement of storage quotas that are defined across the instance via the following setting. For example, +.. code-block:: + + { + "status": "OK", + "data": { + "uploadLimits": {} + } + } + +This API requires the Edit permission on the dataset. + +Use the ``/settings`` API to enable or disable the enforcement of storage quotas that are defined across the instance via the following setting: .. code-block:: @@ -1440,6 +1597,78 @@ The fully expanded example above (without environment variables) looks like this curl -H "X-Dataverse-key: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X POST "https://demo.dataverse.org/api/dataverses/1/templates" --upload-file dataverse-template.json + +Dataverse Role Assignment History +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Get the history of role assignments for a collection. This API call returns a list of role assignments and revocations for the specified dataset. + +.. code-block:: bash + + export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + export SERVER_URL=https://demo.dataverse.org + export ID=1 + + curl -H "X-Dataverse-key:$API_TOKEN" -H "Accept: application/json" "$SERVER_URL/api/dataverses/$ID/assignments/history" + +The fully expanded example above (without environment variables) looks like this: + +.. code-block:: bash + + curl -H "X-Dataverse-key:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -H "Accept: application/json" "https://demo.dataverse.org/api/dataverses/3/assignments/history" + +You can also use the collection alias instead of the numeric id: + +.. code-block:: bash + + export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + export SERVER_URL=https://demo.dataverse.org + export DV_ALIAS=dvAlias + + curl -H "X-Dataverse-key:$API_TOKEN" -H "Accept: application/json" "$SERVER_URL/api/dataverses/$DV_ALIAS/assignments/history" + +The fully expanded example above (without environment variables) looks like this: + +.. code-block:: bash + + curl -H "X-Dataverse-key:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -H "Accept: application/json" "https://demo.dataverse.org/api/datasets/dvAlias/assignments/history" + +The response is a JSON array of role assignment history entries with the following structure for each entry: + +.. code-block:: json + + { + "definedOn": "1", + "assigneeIdentifier": "@user1", + "roleName": "Admin", + "assignedBy": "@dataverseAdmin", + "assignedAt": "2023-01-01T12:00:00Z", + "revokedBy": null, + "revokedAt": null + } + +For revoked assignments, the "revokedBy" and "revokedAt" fields will contain values instead of null. 
+ +To retrieve the history in CSV format, change the Accept header to "text/csv": + +.. code-block:: bash + + export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + export SERVER_URL=https://demo.dataverse.org + export ID=3 + + curl -H "X-Dataverse-key:$API_TOKEN" -H "Accept: text/csv" "$SERVER_URL/api/dataverses/$ID/assignments/history" + +The fully expanded example above (without environment variables) looks like this: + +.. code-block:: bash + + curl -H "X-Dataverse-key:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -H "Accept: text/csv" "https://demo.dataverse.org/api/dataverses/3/assignments/history" + +The CSV response has column headers mirroring the JSON entries. They are internationalized (when internationalization is configured). + +Note: This feature requires the "role-assignment-history" feature flag to be enabled (see :ref:`feature-flags`). + Datasets -------- @@ -2098,14 +2327,26 @@ be available to users who have permission to view unpublished drafts. The api to export SERVER_URL=https://demo.dataverse.org export PERSISTENT_IDENTIFIER=doi:10.5072/FK2/BCCP9Z - curl -H "X-Dataverse-key: $API_TOKEN" -X PUT "$SERVER_URL/api/datasets/:persistentId/versions/compareSummary?persistentId=$PERSISTENT_IDENTIFIER" + curl -H "X-Dataverse-key: $API_TOKEN" -X GET "$SERVER_URL/api/datasets/:persistentId/versions/compareSummary?persistentId=$PERSISTENT_IDENTIFIER" The fully expanded example above (without environment variables) looks like this: .. code-block:: bash - curl -H "X-Dataverse-key: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X PUT "https://demo.dataverse.org/api/datasets/:persistentId/versions/compareSummary?persistentId=doi:10.5072/FK2/BCCP9Z" + curl -H "X-Dataverse-key: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X GET "https://demo.dataverse.org/api/datasets/:persistentId/versions/compareSummary?persistentId=doi:10.5072/FK2/BCCP9Z" + +You can control pagination of the results using the following optional query parameters. +* ``limit``: The maximum number of version differences to return. +* ``offset``: The number of version differences to skip from the beginning of the list. Used for retrieving subsequent pages of results. + +To aid in pagination the JSON response also includes the total number of rows (totalCount) available. + +For example, to get the second page of results, with 2 items per page, you would use ``limit=2`` and ``offset=2`` (skipping the first two results). + +.. code-block:: bash + + curl -H "X-Dataverse-key: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X GET "https://demo.dataverse.org/api/datasets/:persistentId/versions/compareSummary?persistentId=doi:10.5072/FK2/BCCP9Z&limit=2&offset=2" Update Metadata For a Dataset ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -2241,6 +2482,46 @@ For these deletes your JSON file must include an exact match of those dataset fi .. _publish-dataset-api: +Update Dataset Terms of Access +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Updates the terms of access for the restricted files of a dataset by applying it to the draft version, or by creating a draft if none exists. + + +To define custom terms of access, provide a JSON body with the following properties. All fields within ``customTermsOfAccess`` are optional, except if there are restricted files in your dataset then ``fileAccessRequest`` must be set to true or ``termsOfAccess`` must be provided: + +.. 
code-block:: json + + [ + { + "customTermsOfAccess": { + "fileAccessRequest": true, + "termsOfAccess": "Your terms of access for restricted files", + "dataAccessPlace": "Your data access place", + "originalArchive": "Your original archive", + "availabilityStatus": "Your availability status", + "contactForAccess": "Your contact for access", + "sizeOfCollection": "Your size of collection", + "studyCompletion": "Your study completion" + } + } + ] + +.. code-block:: bash + + export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + export SERVER_URL=https://demo.dataverse.org + export ID=3 + export FILE_PATH=access.json + + curl -H "X-Dataverse-key:$API_TOKEN" -X PUT "$SERVER_URL/api/datasets/$ID/access" -H "Content-type:application/json" --upload-file $FILE_PATH + +The fully expanded example above (without environment variables) looks like this: + +.. code-block:: bash + + curl -H "X-Dataverse-key:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X PUT "https://demo.dataverse.org/api/datasets/3/access" -H "Content-type:application/json" --upload-file access.json + Publish a Dataset ~~~~~~~~~~~~~~~~~ @@ -4039,7 +4320,7 @@ Delete files from a dataset. This API call allows you to delete multiple files f curl -H "X-Dataverse-key:$API_TOKEN" -X PUT "$SERVER_URL/api/datasets/:persistentId/deleteFiles?persistentId=$PERSISTENT_IDENTIFIER" \ -H "Content-Type: application/json" \ - -d '{"fileIds": [1, 2, 3]}' + -d '[1, 2, 3]' The fully expanded example above (without environment variables) looks like this: @@ -4047,17 +4328,193 @@ The fully expanded example above (without environment variables) looks like this curl -H "X-Dataverse-key:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X PUT "https://demo.dataverse.org/api/datasets/:persistentId/deleteFiles?persistentId=doi:10.5072/FK2ABCDEF" \ -H "Content-Type: application/json" \ - -d '{"fileIds": [1, 2, 3]}' + -d '[1, 2, 3]' -The ``fileIds`` in the JSON payload should be an array of file IDs that you want to delete from the dataset. +The JSON payload should be an array of file IDs that you want to delete from the dataset. You must have the appropriate permissions to delete files from the dataset. Upon success, the API will return a JSON response with a success message and the number of files deleted. The API call will report a 400 (BAD REQUEST) error if any of the files specified do not exist or are not in the latest version of the specified dataset. -The ``fileIds`` in the JSON payload should be an array of file IDs that you want to delete from the dataset. +The JSON payload should be an array of file IDs that you want to delete from the dataset. + +.. _api-dataset-role-assignment-history: + +Dataset Role Assignment History +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Get the history of role assignments for a dataset. This API call returns a list of role assignments and revocations for the specified dataset. + +.. code-block:: bash + + export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + export SERVER_URL=https://demo.dataverse.org + export ID=3 + + curl -H "X-Dataverse-key:$API_TOKEN" -H "Accept: application/json" "$SERVER_URL/api/datasets/$ID/assignments/history" + +The fully expanded example above (without environment variables) looks like this: + +.. code-block:: bash + + curl -H "X-Dataverse-key:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -H "Accept: application/json" "https://demo.dataverse.org/api/datasets/3/assignments/history" + +You can also use the persistent identifier instead of the numeric id: + +.. 
code-block:: bash + + export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + export SERVER_URL=https://demo.dataverse.org + export PERSISTENT_IDENTIFIER=doi:10.5072/FK2/ABCDEF + + curl -H "X-Dataverse-key:$API_TOKEN" -H "Accept: application/json" "$SERVER_URL/api/datasets/:persistentId/assignments/history?persistentId=$PERSISTENT_IDENTIFIER" + +The fully expanded example above (without environment variables) looks like this: + +.. code-block:: bash + + curl -H "X-Dataverse-key:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -H "Accept: application/json" "https://demo.dataverse.org/api/datasets/:persistentId/assignments/history?persistentId=doi:10.5072/FK2/ABCDEF" + +The response is a JSON array of role assignment history entries with the following structure for each entry: + +.. code-block:: json + + { + "definedOn": "3", + "assigneeIdentifier": "@user1", + "roleName": "Admin", + "assignedBy": "@dataverseAdmin", + "assignedAt": "2023-01-01T12:00:00Z", + "revokedBy": null, + "revokedAt": null + } + +For revoked assignments, the "revokedBy" and "revokedAt" fields will contain values instead of null. + +To retrieve the history in CSV format, change the Accept header to "text/csv": + +.. code-block:: bash + + export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + export SERVER_URL=https://demo.dataverse.org + export ID=3 + + curl -H "X-Dataverse-key:$API_TOKEN" -H "Accept: text/csv" "$SERVER_URL/api/datasets/$ID/assignments/history" + +The fully expanded example above (without environment variables) looks like this: + +.. code-block:: bash + + curl -H "X-Dataverse-key:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -H "Accept: text/csv" "https://demo.dataverse.org/api/datasets/3/assignments/history" + +The CSV response has column headers mirroring the JSON entries. They are internationalized (when internationalization is configured). + +Note: This feature requires the "role-assignment-history" feature flag to be enabled (see :ref:`feature-flags`). + +Dataset Files Role Assignment History +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Get the history of role assignments for the files in a dataset. This API call returns a list of role assignments and revocations for all files in the specified dataset. + +.. code-block:: bash + + export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + export SERVER_URL=https://demo.dataverse.org + export ID=3 + + curl -H "X-Dataverse-key:$API_TOKEN" -H "Accept: application/json" "$SERVER_URL/api/datasets/$ID/files/assignments/history" + +The fully expanded example above (without environment variables) looks like this: + +.. code-block:: bash + + curl -H "X-Dataverse-key:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -H "Accept: application/json" "https://demo.dataverse.org/api/datasets/3/files/assignments/history" + +You can also use the persistent identifier instead of the numeric id: + +.. code-block:: bash + + export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + export SERVER_URL=https://demo.dataverse.org + export PERSISTENT_IDENTIFIER=doi:10.5072/FK2/ABCDEF + + curl -H "X-Dataverse-key:$API_TOKEN" -H "Accept: application/json" "$SERVER_URL/api/datasets/:persistentId/files/assignments/history?persistentId=$PERSISTENT_IDENTIFIER" + +The fully expanded example above (without environment variables) looks like this: + +.. 
code-block:: bash + + curl -H "X-Dataverse-key:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -H "Accept: application/json" "https://demo.dataverse.org/api/datasets/:persistentId/files/assignments/history?persistentId=doi:10.5072/FK2/ABCDEF" + +The JSON response for this call is the same as for the /api/datasets/{id}/assignments/history call above with the exception that definedOn will be a comma-separated list of one or more file IDs. + +To retrieve the history in CSV format, change the Accept header to "text/csv": + +.. code-block:: bash + + export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + export SERVER_URL=https://demo.dataverse.org + export ID=3 + + curl -H "X-Dataverse-key:$API_TOKEN" -H "Accept: text/csv" "$SERVER_URL/api/datasets/$ID/files/assignments/history" + +The fully expanded example above (without environment variables) looks like this: + +.. code-block:: bash + + curl -H "X-Dataverse-key:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -H "Accept: text/csv" "https://demo.dataverse.org/api/datasets/3/files/assignments/history" + +The CSV response for this call is the same as for the /api/datasets/{id}/assignments/history call above with the exception that definedOn will be a comma-separated list of one or more file IDs. + +Note: This feature requires the "role-assignment-history" feature flag to be enabled (see :ref:`feature-flags`). + +Update Dataset License +~~~~~~~~~~~~~~~~~~~~~~ + +Updates the license of a dataset by applying it to the draft version, or by creating a draft if none exists. + +The JSON representation of a license can take two forms, depending on whether you want to specify a predefined license or define custom terms of use and access. + +To set a predefined license (e.g., CC BY 4.0), provide a JSON body with the license name: + +.. code-block:: json + + { + "name": "CC BY 4.0" + } + +To define custom terms of use and access, provide a JSON body with the following properties. All fields within ``customTerms`` are optional, except for the ``termsOfUse`` field, which is required: + +.. code-block:: json + + { + "customTerms": { + "termsOfUse": "Your terms of use", + "confidentialityDeclaration": "Your confidentiality declaration", + "specialPermissions": "Your special permissions", + "restrictions": "Your restrictions", + "citationRequirements": "Your citation requirements", + "depositorRequirements": "Your depositor requirements", + "conditions": "Your conditions", + "disclaimer": "Your disclaimer" + } + } + +.. code-block:: bash + + export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx + export SERVER_URL=https://demo.dataverse.org + export ID=3 + export FILE_PATH=license.json + + curl -H "X-Dataverse-key:$API_TOKEN" -X PUT "$SERVER_URL/api/datasets/$ID/license" -H "Content-type:application/json" --upload-file $FILE_PATH + +The fully expanded example above (without environment variables) looks like this: + +.. code-block:: bash + + curl -H "X-Dataverse-key:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X PUT "https://demo.dataverse.org/api/datasets/3/license" -H "Content-type:application/json" --upload-file license.json Files ----- @@ -4277,8 +4734,21 @@ The fully expanded example above (without environment variables) looks like this ..
code-block:: bash - curl -X GET "https://demo.dataverse.org/api/files/1234/versionDifferences" - curl -X GET "https://demo.dataverse.org/api/files/:persistentId/versionDifferences?persistentId=doi:10.5072/FK2/J8SJZB" + curl -H "X-Dataverse-key: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X GET "https://demo.dataverse.org/api/files/1234/versionDifferences" + curl -H "X-Dataverse-key: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X GET "https://demo.dataverse.org/api/files/:persistentId/versionDifferences?persistentId=doi:10.5072/FK2/J8SJZB" + +You can control pagination of the results using the following optional query parameters. + +* ``limit``: The maximum number of version differences to return. +* ``offset``: The number of version differences to skip from the beginning of the list. Used for retrieving subsequent pages of results. + +To aid in pagination the JSON response also includes the total number of rows (totalCount) available. + +For example, to get the second page of results, with 2 items per page, you would use ``limit=2`` and ``offset=2`` (skipping the first two results). + +.. code-block:: bash + + curl -H "X-Dataverse-key: xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" -X GET "https://demo.dataverse.org/api/files/1234/versionDifferences?limit=2&offset=2" Adding Files ~~~~~~~~~~~~ @@ -5485,13 +5955,13 @@ Builtin users are known as "Username/Email and Password" users in the :doc:`/use Create a Builtin User ~~~~~~~~~~~~~~~~~~~~~ -For security reasons, builtin users cannot be created via API unless the team who runs the Dataverse installation has populated a database setting called ``BuiltinUsers.KEY``, which is described under :ref:`securing-your-installation` and :ref:`database-settings` sections of Configuration in the Installation Guide. You will need to know the value of ``BuiltinUsers.KEY`` before you can proceed. +For security reasons, builtin users cannot be created via API unless the team who runs the Dataverse installation has populated a database setting called ``:BuiltinUsersKey``, which is described under :ref:`securing-your-installation` and :ref:`database-settings` sections of Configuration in the Installation Guide. You will need to know the value of ``:BuiltinUsersKey`` before you can proceed. To create a builtin user via API, you must first construct a JSON document. You can download :download:`user-add.json <../_static/api/user-add.json>` or copy the text below as a starting point and edit as necessary. .. literalinclude:: ../_static/api/user-add.json -Place this ``user-add.json`` file in your current directory and run the following curl command, substituting variables as necessary. Note that both the password of the new user and the value of ``BuiltinUsers.KEY`` are passed as query parameters:: +Place this ``user-add.json`` file in your current directory and run the following curl command, substituting variables as necessary. 
Note that both the password of the new user and the value of ``:BuiltinUsersKey`` are passed as query parameters:: curl -d @user-add.json -H "Content-type:application/json" "$SERVER_URL/api/builtin-users?password=$NEWUSER_PASSWORD&key=$BUILTIN_USERS_KEY" @@ -6117,44 +6587,65 @@ The expected OK (200) response looks something like this: { "status": "OK", - "data": { - "notifications": [ - { - "id": 38, - "type": "CREATEACC", - "displayAsRead": true, - "subjectText": "Root: Your account has been created", - "messageText": "Hello, \nWelcome to...", - "sentTimestamp": "2025-07-21T19:15:37Z" - } + "totalCount": 15, + "data": [ + { + "id": 38, + "type": "CREATEACC", + "displayAsRead": true, + "subjectText": "Root: Your account has been created", + "messageText": "Hello, \nWelcome to...", + "sentTimestamp": "2025-07-21T19:15:37Z" + } ... -This endpoint supports an optional query parameter ``inAppNotificationFormat`` which, if sent as ``true``, retrieves the fields needed to build the in-app notifications for the Notifications section of the Dataverse UI, omitting fields related to email notifications. +This endpoint supports several optional query parameters to filter and paginate the results. + +The ``inAppNotificationFormat`` parameter, if sent as ``true``, retrieves the fields needed to build the in-app notifications for the Notifications section of the Dataverse UI, omitting fields related to email notifications. .. code-block:: bash curl -H "X-Dataverse-key:$API_TOKEN" "$SERVER_URL/api/notifications/all?inAppNotificationFormat=true" -The expected OK (200) response looks something like this: +The ``onlyUnread`` parameter, if sent as ``true``, filters the results to include only notifications that have not been marked as read. + +.. code-block:: bash + + curl -H "X-Dataverse-key:$API_TOKEN" "$SERVER_URL/api/notifications/all?onlyUnread=true" + +The ``limit`` and ``offset`` parameters can be used for pagination. ``limit`` specifies the maximum number of notifications to return, and ``offset`` specifies the number of notifications to skip from the beginning of the list. For example, to retrieve notifications 11 through 15: + +To aid in pagination the JSON response also includes the total number of rows (totalCount) available. + +.. code-block:: bash + + curl -H "X-Dataverse-key:$API_TOKEN" "$SERVER_URL/api/notifications/all?limit=5&offset=10" + +All parameters can be combined. For instance, to get the first page of 10 unread notifications in the in-app format: + +.. code-block:: bash + + curl -H "X-Dataverse-key:$API_TOKEN" "$SERVER_URL/api/notifications/all?inAppNotificationFormat=true&onlyUnread=true&limit=1&offset=0" + +The expected OK (200) response for an in-app format request looks something like this: .. code-block:: text { "status": "OK", - "data": { - "notifications": [ - { - "id": 79, - "type": "CREATEACC", - "displayAsRead": false, - "sentTimestamp": "2025-08-08T08:00:16Z", - "installationBrandName": "Your Installation Name", - "userGuidesBaseUrl": "https://guides.dataverse.org", - "userGuidesVersion": "6.7.1", - "userGuidesSectionPath": "user/index.html" - } - ] - } + "totalCount": 15, + "data": [ + { + "id": 79, + "type": "CREATEACC", + "displayAsRead": false, + "sentTimestamp": "2025-08-08T08:00:16Z", + "installationBrandName": "Your Installation Name", + "userGuidesBaseUrl": "https://guides.dataverse.org", + "userGuidesVersion": "6.7.1", + "userGuidesSectionPath": "user/index.html" + } + ] } ... 
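For scripted access, the ``totalCount`` field can be combined with ``limit`` and ``offset`` to walk through every page. Below is a minimal sketch (not part of the API itself; it assumes ``jq`` is installed and uses a hypothetical page size of 5) that prints the id of each notification:

.. code-block:: bash

  export API_TOKEN=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
  export SERVER_URL=https://demo.dataverse.org
  export PAGE_SIZE=5

  # Request a single notification just to read the reported totalCount.
  TOTAL=$(curl -s -H "X-Dataverse-key:$API_TOKEN" "$SERVER_URL/api/notifications/all?limit=1&offset=0" | jq '.totalCount')

  # Fetch one page at a time until totalCount is reached, printing each notification id.
  for ((OFFSET=0; OFFSET<TOTAL; OFFSET+=PAGE_SIZE)); do
    curl -s -H "X-Dataverse-key:$API_TOKEN" "$SERVER_URL/api/notifications/all?limit=$PAGE_SIZE&offset=$OFFSET" | jq '.data[].id'
  done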
@@ -6775,35 +7266,193 @@ If the PID is not managed by Dataverse, this call will report if the PID is reco Admin ----- -This is the administrative part of the API. For security reasons, it is absolutely essential that you block it before allowing public access to a Dataverse installation. Blocking can be done using settings. See the ``post-install-api-block.sh`` script in the ``scripts/api`` folder for details. See :ref:`blocking-api-endpoints` in Securing Your Installation section of the Configuration page of the Installation Guide. +This is the administrative part of the API. +For security reasons, it is absolutely essential that you block it before allowing public access to a Dataverse installation. +See :ref:`blocking-api-endpoints` in the Installation Guide for details. + +.. note:: See :ref:`curl-examples-and-environment-variables` if you are unfamiliar with the use of export below. + +.. _admin-api-db-settings: + +Manage Database Settings +~~~~~~~~~~~~~~~~~~~~~~~~ + +These are the API endpoints for managing the :ref:`database-settings` listed in the Installation Guide. + +.. _settings_get_all: List All Database Settings -~~~~~~~~~~~~~~~~~~~~~~~~~~ +^^^^^^^^^^^^^^^^^^^^^^^^^^ -List all settings:: +.. code-block:: bash - GET http://$SERVER/api/admin/settings + export SERVER_URL="http://localhost:8080" + + curl "$SERVER_URL/api/admin/settings" -Configure Database Setting -~~~~~~~~~~~~~~~~~~~~~~~~~~ +The fully expanded example above (without environment variables) looks like this: + +.. code-block:: bash -Sets setting ``name`` to the body of the request:: + curl http://localhost:8080/api/admin/settings - PUT http://$SERVER/api/admin/settings/$name +.. _settings_get_single: Get Single Database Setting -~~~~~~~~~~~~~~~~~~~~~~~~~~~ +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash -Get the setting under ``name``:: + export SERVER_URL="http://localhost:8080" + export NAME=":UploadMethods" + + curl "$SERVER_URL/api/admin/settings/$NAME" - GET http://$SERVER/api/admin/settings/$name +The fully expanded example above (without environment variables) looks like this: -Delete Database Setting -~~~~~~~~~~~~~~~~~~~~~~~ +.. code-block:: bash + + curl http://localhost:8080/api/admin/settings/:UploadMethods + +.. _settings_get_single_lang: + +Get Single Database Setting With Language/Locale +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A small number of settings, most notably :ref:`:ApplicationTermsOfUse`, can be saved in multiple languages. + +Use two-character ISO 639-1 language codes. + +.. code-block:: bash + + export SERVER_URL="http://localhost:8080" + export NAME=":ApplicationTermsOfUse" + export LANG="en" + + curl "$SERVER_URL/api/admin/settings/$NAME/lang/$LANG" + +The fully expanded example above (without environment variables) looks like this: + +.. code-block:: bash + + curl http://localhost:8080/api/admin/settings/:ApplicationTermsOfUse/lang/en + +.. _settings_put_single: + +Configure Single Database Setting +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + export SERVER_URL="http://localhost:8080" + export NAME=":InstallationName" + export VALUE="LibreScholar" + + curl -X PUT "$SERVER_URL/api/admin/settings/$NAME" -d "$VALUE" + +The fully expanded example above (without environment variables) looks like this: + +.. code-block:: bash + + curl -X PUT http://localhost:8080/api/admin/settings/:InstallationName -d LibreScholar + +Note: ``NAME`` values are validated for existence and compliance. -Delete the setting under ``name``:: +.. 
_settings_put_single_lang: + +Configure Single Database Setting With Language/Locale +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A small number of settings, most notably :ref:`:ApplicationTermsOfUse`, can be saved in multiple languages. + +Use two-character ISO 639-1 language codes. + +.. code-block:: bash + + export SERVER_URL="http://localhost:8080" + export NAME=":ApplicationTermsOfUse" + export LANG="fr" + + curl -X PUT "$SERVER_URL/api/admin/settings/$NAME/lang/$LANG" --upload-file /tmp/apptou_fr.html + +The fully expanded example above (without environment variables) looks like this: + +.. code-block:: bash + + curl -X PUT http://localhost:8080/api/admin/settings/:ApplicationTermsOfUse/lang/fr --upload-file /tmp/apptou_fr.html + +Note: ``NAME`` and ``LANG`` values are validated for existence and compliance. + +.. _settings_put_bulk: + +Configure All Database Settings +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Using a JSON file, replace all settings in a single idempotent and atomic operation and delete any settings not present in that JSON file. + +Use the JSON ``data`` object in output of ``GET /api/admin/settings`` (:ref:`settings_get_all`) for the JSON input structure for this endpoint. +To put this concretely, you can save just the ``data`` object for your existing settings to disk by filtering them through ``jq`` like this: + +.. code-block:: bash + + curl http://localhost:8080/api/admin/settings | jq '.data' > /tmp/all-settings.json + +Then you can use this "all-settings.json" file as a starting point for your input file. +The :doc:`../installation/config` page of the Installation Guide has a :ref:`complete list of all the available settings `. +Note that settings in the JSON file are validated for existence and compliance. + +.. code-block:: bash + + export SERVER_URL="http://localhost:8080" + + curl -X PUT -H "Content-type:application/json" "$SERVER_URL/api/admin/settings" --upload-file /tmp/all-settings.json + +The fully expanded example above (without environment variables) looks like this: + +.. code-block:: bash + + curl -X PUT -H "Content-type:application/json" http://localhost:8080/api/admin/settings --upload-file /tmp/all-settings.json + +.. _settings_delete_single: + +Delete Single Database Setting +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + export SERVER_URL="http://localhost:8080" + export NAME=":InstallationName" + + curl -X DELETE "$SERVER_URL/api/admin/settings/$NAME" + +The fully expanded example above (without environment variables) looks like this: + +.. code-block:: bash + + curl -X DELETE http://localhost:8080/api/admin/settings/:InstallationName + +.. _settings_delete_single_lang: + +Delete Single Database Setting With Language/Locale +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +A small number of settings, most notably :ref:`:ApplicationTermsOfUse`, can be saved in multiple languages. + +Use two-character ISO 639-1 language codes. + +.. code-block:: bash + + export SERVER_URL="http://localhost:8080" + export NAME=":ApplicationTermsOfUse" + export LANG="fr" + + curl -X DELETE "$SERVER_URL/api/admin/settings/$NAME/lang/$LANG" + +The fully expanded example above (without environment variables) looks like this: + +.. code-block:: bash - DELETE http://$SERVER/api/admin/settings/$name + curl -X DELETE http://localhost:8080/api/admin/settings/:ApplicationTermsOfUse/lang/fr .. _list-all-feature-flags: @@ -7680,7 +8329,7 @@ Get details of a workflow with a given id:: GET http://$SERVER/api/admin/workflows/$id -Add a new workflow. 
Request body specifies the workflow properties and steps in JSON format. +Add a new workflow. Request body specifies the workflow properties and steps in JSON format. Specifically, the body of the message should be a JSON Object with a String "name" for the workflow and a "steps" JSON Array containing a JSON Object per workflow step. (See :doc:`/developers/workflows` for the exiting steps and their required JSON representations.) Sample ``json`` files are available at ``scripts/api/data/workflows/``:: POST http://$SERVER/api/admin/workflows diff --git a/doc/sphinx-guides/source/conf.py b/doc/sphinx-guides/source/conf.py index abf6ba7379b..b6aadae5761 100755 --- a/doc/sphinx-guides/source/conf.py +++ b/doc/sphinx-guides/source/conf.py @@ -70,7 +70,7 @@ # built documents. # # The short X.Y version. -version = '6.8' +version = '6.9' # The full version, including alpha/beta/rc tags. release = version diff --git a/doc/sphinx-guides/source/container/app-image.rst b/doc/sphinx-guides/source/container/app-image.rst index afffeae1c0b..13133a9760f 100644 --- a/doc/sphinx-guides/source/container/app-image.rst +++ b/doc/sphinx-guides/source/container/app-image.rst @@ -17,11 +17,6 @@ Within the main repository, you may find the application image's files at ```_ to build and ship the image within a special Maven profile. -**NOTE: This image is created, maintained and supported by the Dataverse community on a best-effort basis.** -IQSS will not offer you support how to deploy or run it, please reach out to the community for help on using it. -You might be interested in taking a look at :doc:`../developers/containers`, linking you to some (community-based) -efforts. - .. _app-image-supported-tags: Supported Image Tags @@ -81,7 +76,7 @@ For now, stale images will be kept on Docker Hub indefinitely. | Example: :substitution-code:`|nextVersion|-noble` | Summary: Rolling tag, equivalent to ``unstable`` for current development cycle. Will roll over to the rolling production tag after a Dataverse release. - | Discussion: Perhaps you are eager to starting testing features of an upcoming version (e.g. |nextVersion|) in a staging environment. You select the :substitution-code:`|nextVersion|-noble` tag (as opposed to ``unstable``) because you want to stay on |nextVersion| rather switching to the version **after that** when a release is made (which would happen if you had selected the ``unstable`` tag). Also, when the next release comes out (|nextVersion| in this example), you would stay on the :substitution-code:`|nextVersion|-noble` tag, which is the same tag that someone would use who wants the final release of |nextVersion|. (See "Rolling Production", above.) + | Discussion: Perhaps you are eager to start testing features of an upcoming version (e.g. |nextVersion|) in a staging environment. You select the :substitution-code:`|nextVersion|-noble` tag (as opposed to ``unstable``) because you want to stay on |nextVersion| rather than switching to the version **after that** when a release is made (which would happen if you had selected the ``unstable`` tag). Also, when the next release comes out (|nextVersion| in this example), you would stay on the :substitution-code:`|nextVersion|-noble` tag, which is the same tag that someone would use who wants the final release of |nextVersion|. (See "Rolling Production", above.) **NOTE**: In these tags for development usage, the version number will always be 1 minor version ahead of existing Dataverse releases. 
Example: Assume Dataverse ``6.x`` is released, ``6.(x+1)`` is underway. diff --git a/doc/sphinx-guides/source/container/base-image.rst b/doc/sphinx-guides/source/container/base-image.rst index 4adc6bb6fb1..8b8079c7b6e 100644 --- a/doc/sphinx-guides/source/container/base-image.rst +++ b/doc/sphinx-guides/source/container/base-image.rst @@ -16,11 +16,6 @@ Within the main repository, you may find the base image's files at ``/ This Maven module uses the `Maven Docker Plugin `_ to build and ship the image. You may use, extend, or alter this image to your liking and/or host in some different registry if you want to. -**NOTE: This image is created, maintained and supported by the Dataverse community on a best-effort basis.** -IQSS will not offer you support how to deploy or run it, please reach out to the community (:ref:`support`) for help on using it. -You might be interested in taking a look at :doc:`../developers/containers`, linking you to some (community-based) -efforts. - .. _base-image-supported-tags: Supported Image Tags diff --git a/doc/sphinx-guides/source/container/dev-usage.rst b/doc/sphinx-guides/source/container/dev-usage.rst index c02c1d4010f..7bce7158cb3 100644 --- a/doc/sphinx-guides/source/container/dev-usage.rst +++ b/doc/sphinx-guides/source/container/dev-usage.rst @@ -1,7 +1,7 @@ Development Usage ================= -Please note! This Docker setup is not for production! +Please note! This Docker setup is not for :doc:`production `! .. contents:: |toctitle| :local: @@ -145,13 +145,13 @@ Accessing Harvesting Log Files \1. Open a terminal and access the Dataverse container. -Run the following command to access the Dataverse container (assuming your container is named dataverse-1): +Run the following command to access the Dataverse container: .. code-block:: - docker exec -it dataverse-1 bash + docker exec -it dev_dataverse bash -This command opens an interactive shell within the dataverse-1 container. +This command opens an interactive shell within the dev_dataverse container. \2. Navigate to the log files directory. @@ -233,6 +233,13 @@ Hotswapping methods requires using JDWP (Debug Mode), but does not allow switchi **Requires IntelliJ Ultimate!** (Note that `free educational licenses `_ are available) + Go to settings, then plugins. Install "Payara Ultimate Tools". For more information: + + - `plugin homepage `_ + - `docs `_ + - `source `_ + - `issues `_ + .. image:: img/intellij-payara-plugin-install.png #. Configure a connection to Payara: @@ -284,6 +291,7 @@ Hotswapping methods requires using JDWP (Debug Mode), but does not allow switchi You might want to tweak the hot deploy behavior in the "Server" tab now. "Update action" can be found in the run window (see below). + By default it is "Hot Swap classes", which works fine, but as the screenshot shows you can also change it to "Redeploy". "Frame deactivation" means switching from IntelliJ window to something else, e.g. your browser. *Note: static resources like properties, XHTML etc will only update when redeploying!* @@ -305,7 +313,11 @@ Hotswapping methods requires using JDWP (Debug Mode), but does not allow switchi See cheat sheet above for more options. Note that this command either assumes you built the :doc:`app-image` first or will download it from Docker Hub. .. group-tab:: IntelliJ - You can create a service configuration to automatically start services for you. 
+ Note that you can skip this step if you're ok running the command under the "Maven" tab, which is this: + + ``mvn -Pct docker:run -Dapp.skipDeploy`` + + In IntelliJ you can create a service configuration to automatically start services for you. **IMPORTANT**: This requires installation of the `Docker plugin `_. @@ -362,7 +374,7 @@ Hotswapping methods requires using JDWP (Debug Mode), but does not allow switchi .. image:: img/intellij-payara-run-output.png - Manually hotswap classes in "Debug" mode via "Run" > "Debugging Actions" > "Reload Changed Classes". + Manually hotswap classes in "Debug" mode via "Run" > "Debugging Actions" > "Compile and Reload Modified Files". .. image:: img/intellij-payara-run-menu-reload.png diff --git a/doc/sphinx-guides/source/container/intro.rst b/doc/sphinx-guides/source/container/intro.rst index 5099531dcc9..77187f2b40b 100644 --- a/doc/sphinx-guides/source/container/intro.rst +++ b/doc/sphinx-guides/source/container/intro.rst @@ -9,7 +9,7 @@ Dataverse in containers! Intended Audience ----------------- -This guide is intended for anyone who wants to run Dataverse in containers. This is potentially a wide audience, from sysadmins interested in running Dataverse in production in containers (not recommended yet) to contributors working on a bug fix (encouraged!). See :doc:`running/index` for various scenarios and please let us know if your use case is not covered. +This guide is intended for anyone who wants to run Dataverse in containers. This is potentially a wide audience, from sysadmins interested in running Dataverse in production in containers to contributors working on a bug fix. See :doc:`running/index` for various scenarios and please let us know if your use case is not covered. .. _getting-help-containers: diff --git a/doc/sphinx-guides/source/container/running/demo.rst b/doc/sphinx-guides/source/container/running/demo.rst index d4afee8a18a..32de0ea48bf 100644 --- a/doc/sphinx-guides/source/container/running/demo.rst +++ b/doc/sphinx-guides/source/container/running/demo.rst @@ -261,7 +261,7 @@ You should be able to see the new fields from the metadata block you added in th ``curl http://localhost:8983/solr/collection1/schema/fields`` -At this point you can proceed with testing the metadata block in the Dataverse UI. First you'll need to enable it for a collection (see :ref:`general-information` in the User Guide section about collection). Afterwards, create a new dataset, save it, and then edit the metadata for that dataset. Your metadata block should appear. +At this point you can proceed with testing the metadata block in the Dataverse UI. First you'll need to enable it for a collection (see :ref:`general-information` in the User Guide section about collections). Afterwards, create a new dataset, save it, and then edit the metadata for that dataset. Your metadata block should appear. Next Steps ---------- diff --git a/doc/sphinx-guides/source/container/running/production.rst b/doc/sphinx-guides/source/container/running/production.rst index 4fe16447d7e..786851267e9 100644 --- a/doc/sphinx-guides/source/container/running/production.rst +++ b/doc/sphinx-guides/source/container/running/production.rst @@ -1,5 +1,13 @@ -Production (Future) -=================== +Production +========== + +.. _production-security-warning: + +.. warning:: + + The :doc:`demo` tutorial is **NOT SECURE BY DEFAULT**. It uses public, hardcoded passwords and secrets for demonstration purposes only. 
+ + If you use the demo as a structural template, you MUST replace all default secrets before deploying your instance. Failure to do so will result in a vulnerable production environment. .. contents:: |toctitle| :local: @@ -7,21 +15,15 @@ Production (Future) Status ------ -The images described in this guide are not yet recommended for production usage, but we think we are close. (Tagged releases are done; see the "supported image tags" section for :ref:`Application ` and :ref:`Config Baker ` images.) For now, please see :doc:`demo`. - -We'd like to make the following improvements: - -- More docs on setting up additional features - - - How to set up Rserve. +As of Dataverse 6.8, when we introduced image tagging per version (see the :ref:`app-image-supported-tags` section for the :ref:`application image `), we feel that the images described in this guide are ready for production use. Enjoy! -Go through all the features in docs and check what needs to be done differently with containers +The images and the documentation are not perfect, of course. - - Check ports, for example. +For now, we recommend following the :doc:`demo` as a structural template. Note that instead of "latest" you might want to select a specific version. Again see :ref:`app-image-supported-tags`. -To join the discussion on what else might be needed before declaring images ready for production, please comment on https://dataverse.zulipchat.com/#narrow/stream/375812-containers/topic/containers.20for.20production/near/434979159 +The Dataverse guides were originally written with a non-Docker installation in mind so we'd like to rewrite them with both Docker and non-Docker in mind. This is a big job, obviously. 😅 We know we'd like to write more about ports. We'd like to explain `how to set up Rserve `_. Etc., etc. -You are also very welcome to join our meetings. See "how to help" below. +To talk about your ideas for making the images and docs better for production, please feel free to join the `containers for production `_ topic or join a working group meeting (see :ref:`helping-containers`). Limitations ----------- @@ -31,9 +33,9 @@ Limitations How to Help ----------- -You can help the effort to support these images in production by trying them out (see :doc:`demo`) and giving feedback (see :ref:`helping-containers`). +Please try the images (see :doc:`demo`) and give feedback (see :ref:`helping-containers`)! ❤️ Alternatives ------------ -Until the images are ready for production, please use the traditional installation method described in the :doc:`/installation/index`. +The traditional (non-Docker) installation method is described in the :doc:`/installation/index`. diff --git a/doc/sphinx-guides/source/contributor/documentation.md b/doc/sphinx-guides/source/contributor/documentation.md index 2a8d6794921..3d95a11c24f 100644 --- a/doc/sphinx-guides/source/contributor/documentation.md +++ b/doc/sphinx-guides/source/contributor/documentation.md @@ -50,76 +50,78 @@ If you would like to read more about the Dataverse's use of GitHub, please see t ## Building the Guides with Sphinx -While the "quick fix" technique shown above should work fine for minor changes, especially for larger changes, we recommend installing Sphinx on your computer or using a Sphinx Docker container to build the guides locally so you can get an accurate preview of your changes. +While the "quick fix" technique shown above should work fine for minor changes, in many cases, you're going to want to preview changes locally before committing them.
-In case you decide to use a Sphinx Docker container to build the guides, you can skip the next two installation sections, but you will need to have Docker installed. +Before we worry about pushing changes to the code, let's make sure we can build the guides. -### Installing Sphinx +Go to and click "Code" and then follow the instructions to clone the code locally. -First, make a fork of and clone your fork locally. Then change to the ``doc/sphinx-guides`` directory. +### Docker -``cd doc/sphinx-guides`` +Install [Docker Desktop](https://www.docker.com/products/docker-desktop/). -Create a Python virtual environment, activate it, then install dependencies: +From a terminal, switch to the "dataverse" directory you just cloned. This is the root of the git repo. -``python3 -m venv venv`` +`cd dataverse` -``source venv/bin/activate`` +Then try running this command: -``pip install -r requirements.txt`` +`docker run -it --rm -v $(pwd):/docs sphinxdoc/sphinx:7.4.0 bash -c "cd doc/sphinx-guides && pip3 install -r requirements.txt && make html"` -### Installing GraphViz +If all goes well, you should be able to open `doc/sphinx-guides/build/html/index.html` to see the guides you just built. -In some parts of the documentation, graphs are rendered as images using the Sphinx GraphViz extension. +#### Docker with a Makefile -Building the guides requires the ``dot`` executable from GraphViz. +Once you've confirmed you have Docker working, if you have [make](https://en.wikipedia.org/wiki/Make_(software)) installed, you can try the following commands: -This requires having [GraphViz](https://graphviz.org) installed and either having ``dot`` on the path or -[adding options to the `make` call](https://groups.google.com/forum/#!topic/sphinx-users/yXgNey_0M3I). +`make docs-html` -On a Mac we recommend installing GraphViz through [Homebrew](). Once you have Homebrew installed and configured to work with your shell, you can type `brew install graphviz`. +`make docs-pdf` -### Editing and Building the Guides +`make docs-epub` -To edit the existing documentation: - -- Create a branch (see {ref}`how-to-make-a-pull-request`). -- In ``doc/sphinx-guides/source`` you will find the .rst files that correspond to https://guides.dataverse.org. -- Using your preferred text editor, open and edit the necessary files, or create new ones. +`make docs-all` -Once you are done, you can preview the changes by building the guides locally. As explained, you can build the guides with Sphinx locally installed, or with a Docker container. +### Sphinx Installed Locally -#### Building the Guides with Sphinx Installed Locally +First, run `python --version` or `python3 --version` to determine the version of Python you have. If you don't have Python 3.10 or higher, you must upgrade. -Open a terminal, change directories to `doc/sphinx-guides`, activate (or reactivate) your Python virtual environment, and build the guides. +Next, change to the `doc/sphinx-guides` directory. `cd doc/sphinx-guides` +Create a Python virtual environment, activate it, then install dependencies: + +`python3 -m venv venv` + `source venv/bin/activate` -`make clean` +`pip install -r requirements.txt` -`make html` +Next, install [GraphViz](https://graphviz.org) because building the guides requires having the `dot` executable from GraphViz either on the path or passed [as an argument](https://groups.google.com/g/sphinx-users/c/yXgNey_0M3I/m/3T2NipFlBgAJ). 
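If you want to confirm that the `dot` executable is already available on your path (an optional sanity check, not a required step), you can ask it for its version:

`dot -V`

If that prints a GraphViz version string instead of "command not found", Sphinx should be able to find it.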
-#### Building the Guides with a Sphinx Docker Container and a Makefile +On a Mac we recommend installing GraphViz through [Homebrew](). Once you have Homebrew installed and configured to work with your shell, you can type `brew install graphviz`. -We have added a Makefile to simplify the process of building the guides using a Docker container, you can use the following commands from the repository root: +Finally, you can try building the guides with the following command. -- `make docs-html` -- `make docs-pdf` -- `make docs-epub` -- `make docs-all` +`make html` -#### Building the Guides with a Sphinx Docker Container and CLI +If all goes well, you should be able to open `doc/sphinx-guides/build/html/index.html` to see the guides you just built. -If you want to build the guides using a Docker container, execute the following command in the repository root: +## Editing, Building, and Previewing the Guides -`docker run -it --rm -v $(pwd):/docs sphinxdoc/sphinx:7.2.6 bash -c "cd doc/sphinx-guides && pip3 install -r requirements.txt && make html"` +To edit the existing documentation: + +- Create a branch (see {ref}`how-to-make-a-pull-request`). +- In `doc/sphinx-guides/source` you will find the .rst or .md files that correspond to https://guides.dataverse.org. +- Using your preferred text editor, open and edit the necessary files, or create new ones. -#### Previewing the Guides +Once you are done, you can preview the changes by building the guides using one of the options above. After Sphinx is done processing the files you should notice that the `html` folder in `doc/sphinx-guides/build` directory has been updated. You can click on the files in the `html` folder to preview the changes. +## Making a Pull Request + Now you can make a commit with the changes to your own fork in GitHub and submit a pull request. See {ref}`how-to-make-a-pull-request`. ## Writing Guidelines @@ -153,16 +155,22 @@ If the page is written in Markdown (.md), use this form: ### Links -Getting links right with .rst files can be tricky. +Getting links right can be tricky. #### Custom Titles -You can use a custom title when linking to a document like this: +In .rst files you can use a custom title when linking to a document like this: :doc:`Custom title ` See also +In .md files, the same pattern can be used. Here's an example of using a custom title with a ref: + + {ref}`Log in ` + +See also + ### Images A good documentation is just like a website enhanced and upgraded by adding high quality and self-explanatory images. Often images depict a lot of written text in a simple manner. Within our Sphinx docs, you can add them in two ways: a) add a PNG image directly and include or b) use inline description languages like GraphViz (current only option). @@ -179,7 +187,7 @@ The HTML version of the guides is the official one. Any other formats are mainta If you would like to build a PDF version of the guides and have Docker installed, please try the command below from the root of the git repo: -`docker run -it --rm -v $(pwd):/docs sphinxdoc/sphinx-latexpdf:7.2.6 bash -c "cd doc/sphinx-guides && pip3 install -r requirements.txt && make latexpdf LATEXMKOPTS=\"-interaction=nonstopmode\"; cd ../.. && ls -1 doc/sphinx-guides/build/latex/Dataverse.pdf"` +`docker run -it --rm -v $(pwd):/docs sphinxdoc/sphinx-latexpdf:7.4.0 bash -c "cd doc/sphinx-guides && pip3 install -r requirements.txt && make latexpdf LATEXMKOPTS=\"-interaction=nonstopmode\"; cd ../.. 
&& ls -1 doc/sphinx-guides/build/latex/Dataverse.pdf"` A few notes about the command above: diff --git a/doc/sphinx-guides/source/developers/big-data-support.rst b/doc/sphinx-guides/source/developers/big-data-support.rst index 75a50e2513d..b2724ce01e3 100644 --- a/doc/sphinx-guides/source/developers/big-data-support.rst +++ b/doc/sphinx-guides/source/developers/big-data-support.rst @@ -57,6 +57,15 @@ Allow CORS for S3 Buckets **IMPORTANT:** One additional step that is required to enable direct uploads via a Dataverse installation and for direct download to work with previewers and direct upload to work with dvwebloader (:ref:`folder-upload`) is to allow cross site (CORS) requests on your S3 store. The example below shows how to enable CORS rules (to support upload and download) on a bucket using the AWS CLI command line tool. Note that you may want to limit the AllowedOrigins and/or AllowedHeaders further. https://github.com/gdcc/dataverse-previewers/wiki/Using-Previewers-with-download-redirects-from-S3 has some additional information about doing this. +Dataverse itself will only emit the necessary ``Access-Control-*`` headers to browsers when CORS has been explicitly enabled via the JVM/MicroProfile setting :ref:`dataverse.cors.origin `. You must both: + +* Configure an appropriate ``dataverse.cors.origin`` value (single origin, comma-separated list, or ``*``) on the Dataverse application server; and +* Configure a matching/compatible CORS policy on each S3 bucket (and any CDN/proxy in front of it) that will be used for direct upload or for redirect (download-redirect) operations consumed by previewers. + +If you specify multiple origins in ``dataverse.cors.origin`` Dataverse will echo back the requesting origin (when it matches) and will include ``Vary: Origin`` so that shared caches do not serve one origin's response to another. If you configure ``*`` Dataverse will respond with ``Access-Control-Allow-Origin: *`` (note that browsers will not allow credentialed requests with a wildcard). + +Make sure the bucket CORS configuration ``AllowedOrigins`` is at least as permissive as the origins you configure in ``dataverse.cors.origin``. If the bucket allows ``*`` but the Dataverse application only allows a subset, the browser will still enforce the more restrictive application response. + If you'd like to check the CORS configuration on your bucket before making changes: ``aws s3api get-bucket-cors --bucket `` @@ -156,7 +165,7 @@ Globus File Transfer Note: Globus file transfer is still experimental but feedback is welcome! See :ref:`support`. Users can transfer files via `Globus `_ into and out of datasets, or reference files on a remote Globus endpoint, when their Dataverse installation is configured to use a Globus accessible store(s) -and a community-developed `dataverse-globus `_ app has been properly installed and configured. +and a community-developed `dataverse-globus `_ app has been properly installed and configured. Globus endpoints can be in a variety of places, from data centers to personal computers. This means that from within the Dataverse software, a Globus transfer can feel like an upload or a download (with Globus Personal Connect running on your laptop, for example) or it can feel like a true transfer from one server to another (from a cluster in a data center into a Dataverse dataset or vice versa). 
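Returning to the CORS configuration described earlier on this page, the following is a minimal, hypothetical sketch of the two halves of that setup. The origin and bucket name are placeholders, and the ``asadmin`` command is only one of several ways to set a JVM/MicroProfile option (see the Configuration section of the Installation Guide for the authoritative mechanisms), so adapt both commands to your own deployment:

.. code-block:: bash

  # Hypothetical origin; note the escaped colon required by asadmin.
  ./asadmin create-jvm-options "-Ddataverse.cors.origin=https\://previewers.example.edu"

  # The bucket CORS policy must allow at least the same origin.
  # Inspect the current policy with the AWS CLI, as shown earlier:
  aws s3api get-bucket-cors --bucket my-dataverse-bucket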
@@ -187,6 +196,6 @@ As described in that document, Globus transfers can be initiated by choosing the An overview of the control and data transfer interactions between components was presented at the 2022 Dataverse Community Meeting and can be viewed in the `Integrations and Tools Session Video `_ around the 1 hr 28 min mark. -See also :ref:`Globus settings <:GlobusSettings>`. +See also :ref:`Globus settings <:GlobusSettings>` and :ref:`globus-stores`. An alternative, experimental implementation of Globus polling of ongoing upload transfers has been added in v6.4. This framework does not rely on the instance staying up continuously for the duration of the transfer and saves the state information about Globus upload requests in the database. Due to its experimental nature it is not enabled by default. See the ``globus-use-experimental-async-framework`` feature flag (see :ref:`feature-flags`) and the JVM option :ref:`dataverse.files.globus-monitoring-server`. diff --git a/doc/sphinx-guides/source/developers/classic-dev-env.rst b/doc/sphinx-guides/source/developers/classic-dev-env.rst index d1f54fd9d5f..5dc68325767 100755 --- a/doc/sphinx-guides/source/developers/classic-dev-env.rst +++ b/doc/sphinx-guides/source/developers/classic-dev-env.rst @@ -93,15 +93,15 @@ On Linux, install ``jq`` from your package manager or download a binary from htt Install Payara ~~~~~~~~~~~~~~ -Payara 6.2025.3 or higher is required. +Payara 6.2025.10 or higher is required. To install Payara, run the following commands: ``cd /usr/local`` -``sudo curl -O -L https://nexus.payara.fish/repository/payara-community/fish/payara/distributions/payara/6.2025.3/payara-6.2025.3.zip`` +``sudo curl -O -L https://nexus.payara.fish/repository/payara-community/fish/payara/distributions/payara/6.2025.10/payara-6.2025.10.zip`` -``sudo unzip payara-6.2025.3.zip`` +``sudo unzip payara-6.2025.10.zip`` ``sudo chown -R $USER /usr/local/payara6`` diff --git a/doc/sphinx-guides/source/developers/deployment.rst b/doc/sphinx-guides/source/developers/deployment.rst index 46cf95dae54..ec9929136b7 100755 --- a/doc/sphinx-guides/source/developers/deployment.rst +++ b/doc/sphinx-guides/source/developers/deployment.rst @@ -78,7 +78,7 @@ Amazon offers instructions on using an IAM role to grant permissions to applicat Configure Ansible File (Optional) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In order to configure Dataverse installation settings such as the password of the dataverseAdmin user, download https://raw.githubusercontent.com/GlobalDataverseCommunityConsortium/dataverse-ansible/master/defaults/main.yml and edit the file to your liking. +In order to configure Dataverse installation settings such as the password of the dataverseAdmin user, download https://raw.githubusercontent.com/gdcc/dataverse-ansible/develop/defaults/main.yml and edit the file to your liking. You can skip this step if you're fine with the values in the "main.yml" file in the link above. @@ -89,7 +89,7 @@ Once you have done the configuration above, you are ready to try running the "ec Download `ec2-create-instance.sh`_ and put it somewhere reasonable. For the purpose of these instructions we'll assume it's in the "Downloads" directory in your home directory. -.. _ec2-create-instance.sh: https://raw.githubusercontent.com/GlobalDataverseCommunityConsortium/dataverse-ansible/master/ec2/ec2-create-instance.sh +.. 
_ec2-create-instance.sh: https://raw.githubusercontent.com/gdcc/dataverse-ansible/develop/ec2/ec2-create-instance.sh To run the script, you can make it executable (``chmod 755 ec2-create-instance.sh``) or run it with bash, like this with ``-h`` as an argument to print the help: diff --git a/doc/sphinx-guides/source/developers/making-library-releases.rst b/doc/sphinx-guides/source/developers/making-library-releases.rst index be867f9196a..e63c998b837 100755 --- a/doc/sphinx-guides/source/developers/making-library-releases.rst +++ b/doc/sphinx-guides/source/developers/making-library-releases.rst @@ -13,7 +13,9 @@ Note: See :doc:`making-releases` for Dataverse itself. We release Java libraries to Maven Central that are used by Dataverse (and perhaps `other `_ `software `_!): - https://central.sonatype.com/namespace/org.dataverse +- https://central.sonatype.com/namespace/org.dataverse.test - https://central.sonatype.com/namespace/io.gdcc +- https://central.sonatype.com/namespace/io.gdcc.export We release JavaScript/TypeScript libraries to npm: @@ -36,6 +38,32 @@ Releasing a Snapshot Version to Maven Central That is to say, to make a snapshot release, you only need to get one or more commits into the default branch. +It's possible, of course, to make snapshot releases outside of GitHub Actions, from environments such as your laptop. Generally, you'll want to look at the GitHub Action and try to do the equivalent. You'll need a file set up locally at ``~/.m2/settings.xml`` with the following (contact a core developer for the redacted bits): + +.. code-block:: bash + + + + + central + REDACTED + REDACTED + + + + +Then, study the GitHub Action and perform similar commands from your local environment. For example, as of this writing, for the dataverse-spi project, you can run the following commands, substituting the suffix you need: + +``mvn -f modules/dataverse-spi -Dproject.version.suffix="2.1.0-PR11767-SNAPSHOT" verify`` + +``mvn -f modules/dataverse-spi -Dproject.version.suffix="2.1.0-PR11767-SNAPSHOT" deploy`` + +This will upload the snapshot here, for example: https://central.sonatype.com/repository/maven-snapshots/io/gdcc/dataverse-spi/2.1.02.1.0-PR11767-SNAPSHOT/dataverse-spi-2.1.02.1.0-PR11767-20250827.182026-1.jar + +Before OSSRH was retired, you could browse through snapshot jars you published at https://s01.oss.sonatype.org/content/repositories/snapshots/io/gdcc/dataverse-spi/2.0.0-PR9685-SNAPSHOT/, for example. Now, even though you may see the URL of the jar as shown above during the "deploy" step, if you try to browse the various snapshot jars at https://central.sonatype.com/repository/maven-snapshots/io/gdcc/dataverse-spi/2.1.02.1.0-PR11767-SNAPSHOT/ you'll see "This maven2 hosted repository is not directly browseable at this URL. Please use the browse or HTML index views to inspect the contents of this repository." Sadly, the "browse" and "HTML index" links don't work, as noted in a `question `_ on the Sonatype Community forum. Below is a suggestion for confirming that the jar was uploaded properly, which is to use Maven to copy the jar to your local directory. You could then compare checksums. 
+ +``mvn dependency:copy -DrepoUrl=https://central.sonatype.com/repository/maven-snapshots/ -Dartifact=io.gdcc:dataverse-spi:2.1.02.1.0-PR11767-SNAPSHOT -DoutputDirectory=.`` + Releasing a Release (Non-Snapshot) Version to Maven Central ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -83,60 +111,18 @@ Releasing a New Library to Maven Central At a high level: - Start with a snapshot release. -- Use an existing pom.xml as a starting point. -- Use existing GitHub Actions workflows as a starting point. -- Create secrets in the new library's GitHub repo used by the workflow. -- If you need an entire new namespace, look at previous issues such as https://issues.sonatype.org/browse/OSSRH-94575 and https://issues.sonatype.org/browse/OSSRH-94577 +- Use an existing pom.xml as a starting point, such as from `Croissant `_, that inherits from the common Maven parent (https://github.com/gdcc/maven-parent). You can also play around with the "hello" project (https://github.com/gdcc/hello) and even make releases from it since it is designed to be a sandbox for publishing to Maven Central. +- Use existing GitHub Actions workflows as a starting point, such as from `Croissant `_. As of this writing we have separate actions for ``maven-snapshot.yml`` and ``maven-release.yml``. +- For repos under https://github.com/IQSS, create secrets in the new library's GitHub repo used by the workflow. This is necessary for the IQSS org because "organization secrets are not available for organizations on legacy per-repository billing plans." For repos under https://github.com/gdcc you can make use of shared secrets at the org level. These are the environment variables we use: -Updating pom.xml for a Snapshot Release -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + - DATAVERSEBOT_GPG_KEY -Before publishing a final version to Maven Central, you should publish a snapshot release or two. For each snapshot release you publish, the jar name will be unique each time (e.g. ``foobar-0.0.1-20240430.175110-3.jar``), so you can safely publish over and over with the same version number. + - DATAVERSEBOT_GPG_PASSWORD -We use the `Nexus Staging Maven Plugin `_ to push snapshot releases to https://s01.oss.sonatype.org/content/groups/staging/io/gdcc/ and https://s01.oss.sonatype.org/content/groups/staging/org/dataverse/ + - DATAVERSEBOT_SONATYPE_TOKEN -Add the following to your pom.xml: - -.. code-block:: xml - - 0.0.1-SNAPSHOT - - - - ossrh - https://s01.oss.sonatype.org/content/repositories/snapshots - - - ossrh - https://s01.oss.sonatype.org/service/local/staging/deploy/maven2/ - - - - - org.sonatype.plugins - nexus-staging-maven-plugin - ${nexus-staging.version} - true - - ossrh - https://s01.oss.sonatype.org - true - - - -Configuring Secrets -~~~~~~~~~~~~~~~~~~~ - -In GitHub, you will likely need to configure the following secrets: - -- DATAVERSEBOT_GPG_KEY -- DATAVERSEBOT_GPG_PASSWORD -- DATAVERSEBOT_SONATYPE_TOKEN -- DATAVERSEBOT_SONATYPE_USERNAME - -Note that some of these secrets might be configured at the org level (e.g. gdcc or IQSS). 
- -Many of the automated tasks are performed by the dataversebot account on GitHub: https://github.com/dataversebot + - DATAVERSEBOT_SONATYPE_USERNAME +- If you need an entire new namespace, look at previous issues such as https://issues.sonatype.org/browse/OSSRH-94575 and https://issues.sonatype.org/browse/OSSRH-94577 npm (JavaScript/TypeScript) --------------------------- diff --git a/doc/sphinx-guides/source/developers/making-releases.rst b/doc/sphinx-guides/source/developers/making-releases.rst index 028b80e2892..fbbc2e5d3ae 100755 --- a/doc/sphinx-guides/source/developers/making-releases.rst +++ b/doc/sphinx-guides/source/developers/making-releases.rst @@ -8,9 +8,13 @@ Making Releases Introduction ------------ -This document is about releasing the main Dataverse app (https://github.com/IQSS/dataverse). See :doc:`making-library-releases` for how to release our various libraries. Other projects have their own release documentation. +.. note:: This document is about releasing the main Dataverse app (https://github.com/IQSS/dataverse). See :doc:`making-library-releases` for how to release our various libraries. Other projects have their own release documentation. -Below you'll see branches like "develop" and "master" mentioned. For more on our branching strategy, see :doc:`version-control`. +.. note:: Below you'll see branches like "develop" and "master" mentioned. For more on our branching strategy, see :doc:`version-control`. + +Dataverse releases are time-based as opposed to being feature-based. That is, we announce an approximate release date in advance (e.g. for `6.8 `_) and try to hit that deadline. If features we're working on aren't ready yet, the train will leave the station without them. We release quarterly. + +We also announce "last call" dates for both community pull requests and those made by core developers. If you are part of the community and have made a pull request, you have until this date to ask the team to add the upcoming milestone to your pull request. The same goes for core developers. This is not a guarantee that these pull requests will be reviewed, tested, QA'ed and merged before :ref:`code freeze `, but we'll try. Regular or Hotfix? ------------------ @@ -30,7 +34,9 @@ Early on, make sure it's clear what type of release this is. The steps below des Ensure Issues Have Been Created ------------------------------- -Some of the steps in this document are well-served by having their own dedicated GitHub issue. You'll see a label like this on them: +We have a "create release issues" script at https://github.com/IQSS/dv-project-metrics that should be run a week or so before code freeze. + +For each issue that is created by the script there is likely a corresponding step in this document that has "dedicated" label on it like this: |dedicated| @@ -41,35 +47,39 @@ There are a variety of reasons why a step might deserve its own dedicated issue: Steps don't get their own dedicated issue if it would be confusing to have multiple people involved. Too many cooks in the kitchen, as they say. Also, some steps are so small the overhead of an issue isn't worth it. -Before the release even begins you can coordinate with the project manager about the creation of these issues. - .. |dedicated| raw:: html Dedicated Issue   +.. _declare-code-freeze: + Declare a Code Freeze --------------------- -The following steps are made more difficult if code is changing in the "develop" branch. Declare a code freeze until the release is out. Do not allow pull requests to be merged. 
+When we declare a code freeze, we mean: -For a hotfix, a code freeze (no merging) is necessary not because we want code to stop changing in the branch being hotfix released, but because bumping the version used in Jenkins/Ansible means that API tests will fail in pull requests until the version is bumped in those pull requests. +- No additional features will be merged until the freeze is lifted. +- Bug fixes will only be merged if they relate to the upcoming release in some way, such as fixes for regressions or performance problems in that release. +- Pull requests that directly affect the release, such as bumping the version, will be merged, of course. -Conduct Performance Testing ---------------------------- +The benefits of the code freeze are: -|dedicated| +- The team can focus on getting the release out together. +- Regression and performance testing can happen on code that isn't changing. +- The release notes can be written without having to worry about new features (and their release note snippets) being merged in. -See :doc:`/qa/performance-tests` for details. +In short, the steps described below become easier under a code freeze. -Conduct Regression Testing ---------------------------- +Note: for a hotfix, a code freeze is necessary not because we want code to stop changing in the branch being hotfix released, but because bumping the version used in Jenkins/Ansible means that API tests will fail in pull requests until the version is bumped in those pull requests. Basically, we want to get the hotfix merged quickly so we can propagate the version bump into all open pull requests so that API tests can start passing again in those pull requests. -|dedicated| +Push Back Milestones on Pull Requests That Missed the Train +----------------------------------------------------------- -See :doc:`/qa/testing-approach` for details. -Refer to the provided regression checklist for the list of items to verify during the testing process: `Regression Checklist `_. +As of this writing, we optimistically add milestones to issues and pull requests, hoping that the work will be complete before code freeze. Inevitably, we're a bit too optimistic. + +Hopefully, as the release approached, the team has already decided which pull requests (that aren't related to the release) won't make the cut. If not, go ahead and bump them to the next release. .. _write-release-notes: @@ -85,7 +95,7 @@ The task at or near release time is to collect these snippets into a single file - Find the issue in GitHub that tracks the work of creating release notes for the upcoming release. - Create a branch, add a .md file for the release (ex. 5.10.1 Release Notes) in ``/doc/release-notes`` and write the release notes, making sure to pull content from the release note snippets mentioned above. Snippets may not include any issue number or pull request number in the text so be sure to copy the number from the filename of the snippet into the final release note. - Delete (``git rm``) the release note snippets as the content is added to the main release notes file. -- Include instructions describing the steps required to upgrade the application from the previous version. These must be customized for release numbers and special circumstances such as changes to metadata blocks and infrastructure. +- Include instructions describing the steps required to upgrade the application from the previous version. These must be customized for release numbers and special circumstances such as changes to metadata blocks and infrastructure. 
These instructions are required for the next steps (deploying to various environments) so try to prioritize them over finding just the right words in release highlights (which you can do later). - Make a pull request. Here's an example: https://github.com/IQSS/dataverse/pull/11613 - Note that we won't merge the release notes until after we have confirmed that the upgrade instructions are valid by performing a couple upgrades. @@ -110,12 +120,58 @@ ssh into the dataverse-internal server and download the release candidate war fi Go to /doc/release-notes, open the release-notes.md file for the release we're working on, and perform all the steps under "Upgrade Instructions". Note that for regular releases, we haven't bumped the version yet so you won't be able to follow the steps exactly. (For hotfix releases, the version will be bumped already.) +Deploy Release Candidate to QA +------------------------------ + +|dedicated| + +Deploy the same war file to https://qa.dataverse.org using the same upgrade instructions as above. + +Solicit Feedback from Curation Team +----------------------------------- + +Ask the curation team to test on https://qa.dataverse.org and give them five days to provide feedback. + + +Conduct Performance Testing +--------------------------- + +|dedicated| + +See :doc:`/qa/performance-tests` for details. + +Conduct Regression Testing +--------------------------- + +|dedicated| + +Regression testing should be conducted on production data. +See :doc:`/qa/testing-approach` for details. +Refer to the provided regression checklist for the list of items to verify during the testing process: `Regression Checklist `_. + +Build the Guides for the Release Candidate +------------------------------------------ + +Go to https://jenkins.dataverse.org/job/guides.dataverse.org/ and make the following adjustments to the config: + +- Repository URL: ``https://github.com/IQSS/dataverse.git`` +- Branch Specifier (blank for 'any'): ``*/develop`` +- ``VERSION`` (under "Build Steps"): use the next release version but add "-rc.1" to the end. Don't prepend a "v". Use ``6.8-rc.1`` (for example) + +Click "Save" then "Build Now". + +Make sure the guides directory appears in the expected location such as https://guides.dataverse.org/en/6.8-rc.1/ + +When previewing the HTML version of docs from pull requests, we don't usually use this Jenkins job, relying instead on automated ReadTheDocs builds. The reason for doing this step now while we wait for feedback from the Curation Team is that it's an excellent time to fix the Jenkins job, if necessary, to accommodate any changes needed to continue to build the docs. For example, Sphinx might need to be updated or a dependency might need to be installed. Such changes should be listed in the release notes for documentation writers. + Deploy Release Candidate to Demo -------------------------------- |dedicated| -Deploy the same war file to https://demo.dataverse.org using the same upgrade instructions as above. +Time has passed. The curation team has given feedback. We've finished regression and performance testing. Fixes may have been merged into the "develop" branch. We're ready to actually make the release now, which includes deploying a release candidate to the demo server. + +Build a new war file, if necessary, and deploy it to https://demo.dataverse.org using the upgrade instructions in the release notes. 
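The release notes' upgrade instructions are the authoritative steps. As a rough, illustrative sketch of the redeploy step itself on a Payara-based installation (the Payara path, application name, and version numbers below are placeholders, not the official procedure):

.. code-block:: bash

    # Illustrative only: swap in the release candidate war on Payara.
    # Replace the path, app name, and war file with values from the release notes.
    export PAYARA=/usr/local/payara6
    $PAYARA/bin/asadmin list-applications               # note the currently deployed application name
    $PAYARA/bin/asadmin undeploy dataverse-6.7          # use the name reported above
    $PAYARA/bin/asadmin deploy /tmp/dataverse-6.8.war   # the release candidate war built for this release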
Merge Release Notes (Once Ready) -------------------------------- @@ -171,16 +227,25 @@ Merge "develop" into "master" (non-hotfix only) If this is a regular (non-hotfix) release, create a pull request to merge the "develop" branch into the "master" branch using this "compare" link: https://github.com/IQSS/dataverse/compare/master...develop -Once important tests have passed (compile, unit tests, etc.), merge the pull request (skipping code review is ok). Don't worry about style tests failing such as for shell scripts. +Allow time for important tests (compile, unit tests, etc.) to pass. Don't worry about style tests failing such as for shell scripts. It's ok to skip code review. + +When merging the pull request, be sure to choose "create a merge commit" and not "squash and merge" or "rebase and merge". We suspect that choosing squash or rebase may have led to `lots of merge conflicts `_ when we tried to perform this "merge develop to master" step, forcing us to `re-do `_ the previous release before we could proceed with the current release. If this is a hotfix release, skip this whole "merge develop to master" step (the "develop" branch is not involved until later). +Confirm "master" Mergeability +----------------------------- + +Hopefully, the previous step went ok. As a sanity check, use the "compare" link at https://github.com/IQSS/dataverse/compare/master...develop again to look for merge conflicts without making a pull request. + +If the GitHub UI tells you there would be merge conflicts, something has gone horribly wrong (again) with the "merge develop to master" step. Stop and ask for help. + Add Milestone to Pull Requests and Issues ----------------------------------------- Often someone is making sure that the proper milestone (e.g. 5.10.1) is being applied to pull requests and issues, but sometimes this falls between the cracks. -Check for merged pull requests that have no milestone by going to https://github.com/IQSS/dataverse/pulls and entering `is:pr is:merged no:milestone `_ as a query. If you find any, add the milestone to the pull request and any issues it closes. This includes the "merge develop into master" pull request above. +Check for merged pull requests that have no milestone by going to https://github.com/IQSS/dataverse/pulls and entering `is:pr is:merged no:milestone `_ as a query. If you find any, first check if those pull requests are against open pull requests. If so, do nothing. Otherwise, add the milestone to the pull request and any issues it closes. This includes the "merge develop into master" pull request above. (Optional) Test Docker Images ----------------------------- diff --git a/doc/sphinx-guides/source/developers/testing.rst b/doc/sphinx-guides/source/developers/testing.rst index 1690864d453..733a0b0ba28 100755 --- a/doc/sphinx-guides/source/developers/testing.rst +++ b/doc/sphinx-guides/source/developers/testing.rst @@ -128,7 +128,7 @@ You might find studying the following test classes helpful in writing tests for - DeletePrivateUrlCommandTest.java - GetPrivateUrlCommandTest.java -In addition, there is a writeup on "The Testable Command" at https://github.com/IQSS/dataverse/blob/develop/doc/theTestableCommand/TheTestableCommand.md . +In addition, there is a writeup on "The Testable Command" at https://github.com/IQSS/dataverse/blob/master/doc/theTestableCommand/TheTestableCommand.md . Running Non-Essential (Excluded) Unit Tests ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -169,12 +169,12 @@ different people. 
For our purposes, an integration test can have two flavors: Running the Full API Test Suite Using EC2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -**Prerequisite:** To run the API test suite in an EC2 instance you should first follow the steps in the :doc:`deployment` section to get set up with the AWS binary to launch EC2 instances. If you're here because you just want to spin up a branch, you'll still want to follow the AWS deployment setup steps, but may find the `ec2-create README.md `_ Quick Start section helpful. +**Prerequisite:** To run the API test suite in an EC2 instance you should first follow the steps in the :doc:`deployment` section to get set up with the AWS binary to launch EC2 instances. If you're here because you just want to spin up a branch, you'll still want to follow the AWS deployment setup steps, but may find the `ec2-create README.md `_ Quick Start section helpful. -You may always retrieve a current copy of the ec2-create-instance.sh script and accompanying group_var.yml file from the `dataverse-ansible repo `_. Since we want to run the test suite, let's grab the group_vars used by Jenkins: +You may always retrieve a current copy of the ec2-create-instance.sh script and accompanying group_var.yml file from the `dataverse-ansible repo `_. Since we want to run the test suite, let's grab the group_vars used by Jenkins: -- `ec2-create-instance.sh `_ -- `jenkins.yml `_ +- `ec2-create-instance.sh `_ +- `jenkins.yml `_ Edit ``jenkins.yml`` to set the desired GitHub repo and branch, and to adjust any other options to meet your needs: @@ -184,7 +184,7 @@ Edit ``jenkins.yml`` to set the desired GitHub repo and branch, and to adjust an - ``dataverse.unittests.enabled: true`` - ``dataverse.sampledata.enabled: true`` -If you wish, you may pass the script a ``-l`` flag with a local relative path in which the script will `copy various logs `_ at the end of the test suite for your review. +If you wish, you may pass the script a ``-l`` flag with a local relative path in which the script will `copy various logs `_ at the end of the test suite for your review. Finally, run the script: @@ -209,7 +209,7 @@ The Burrito Key For reasons that have been lost to the mists of time, the Dataverse software really wants you to to have a burrito. Specifically, if you're trying to run REST Assured tests and see the error "Dataverse config issue: No API key defined for built in user management", you must run the following curl command (or make an equivalent change to your database): -``curl -X PUT -d 'burrito' http://localhost:8080/api/admin/settings/BuiltinUsers.KEY`` +``curl -X PUT -d 'burrito' http://localhost:8080/api/admin/settings/:BuiltinUsersKey`` Without this "burrito" key in place, REST Assured will not be able to create users. We create users to create objects we want to test, such as collections, datasets, and files. 
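As a quick, hedged sanity check that the key is in place, you can try creating a builtin user yourself (the user details and password below are placeholders; see the :doc:`/api/native-api` section of the API Guide for the authoritative builtin-users documentation):

.. code-block:: bash

    # Create a throwaway builtin user using the "burrito" key set above.
    export SERVER_URL=http://localhost:8080
    cat > /tmp/user.json <<'EOF'
    {
      "firstName": "Test",
      "lastName": "User",
      "userName": "testuser",
      "affiliation": "Testing",
      "position": "Tester",
      "email": "testuser@mailinator.com"
    }
    EOF
    curl -H "Content-type:application/json" -X POST -d @/tmp/user.json \
      "$SERVER_URL/api/builtin-users?password=testpass&key=burrito"

If the setting is absent, you should see the same "Dataverse config issue" error mentioned above.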
@@ -526,7 +526,7 @@ Browser-Based Testing Installation Testing ~~~~~~~~~~~~~~~~~~~~ -- Work with @donsizemore to automate testing of https://github.com/GlobalDataverseCommunityConsortium/dataverse-ansible +- Work with @donsizemore to automate testing of https://github.com/gdcc/dataverse-ansible Future Work on Load/Performance Testing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -538,4 +538,4 @@ Future Work on Load/Performance Testing Future Work on Accessibility Testing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- Using https://github.com/GlobalDataverseCommunityConsortium/dataverse-ansible and hooks available from accessibility testing tools, automate the running of accessibility tools on PRs so that developers will receive quicker feedback on proposed code changes that reduce the accessibility of the application. +- Using https://github.com/gdcc/dataverse-ansible and hooks available from accessibility testing tools, automate the running of accessibility tools on PRs so that developers will receive quicker feedback on proposed code changes that reduce the accessibility of the application. diff --git a/doc/sphinx-guides/source/developers/tips.rst b/doc/sphinx-guides/source/developers/tips.rst index 9295f3a8d12..f3046c9018c 100755 --- a/doc/sphinx-guides/source/developers/tips.rst +++ b/doc/sphinx-guides/source/developers/tips.rst @@ -124,7 +124,7 @@ Here's an example of using these credentials from within the PostgreSQL containe .. code-block:: bash - pdurbin@beamish dataverse % docker exec -it postgres-1 bash + pdurbin@beamish dataverse % docker exec -it dev_postgres bash root@postgres:/# export PGPASSWORD=secret root@postgres:/# psql -h localhost -U dataverse dataverse psql (16.3 (Debian 16.3-1.pgdg120+1)) diff --git a/doc/sphinx-guides/source/developers/workflows.rst b/doc/sphinx-guides/source/developers/workflows.rst index 38ca6f4e141..6f562cd9dd1 100644 --- a/doc/sphinx-guides/source/developers/workflows.rst +++ b/doc/sphinx-guides/source/developers/workflows.rst @@ -27,6 +27,8 @@ If a step in a workflow fails, the Dataverse installation makes an effort to rol provider offers two steps for sending and receiving customizable HTTP requests. *http/sr* and *http/authExt*, detailed below, with the latter able to use the API to make changes to the dataset being processed. (Both lock the dataset to prevent other processes from changing the dataset between the time the step is launched to when the external process responds to the Dataverse instance.) +.. _workflow_admin: + Administration ~~~~~~~~~~~~~~ @@ -36,6 +38,8 @@ At the moment, defining a workflow for each trigger is done for the entire insta In order to prevent unauthorized resuming of workflows, the Dataverse installation maintains a "white list" of IP addresses from which resume requests are honored. This list is maintained using the ``/api/admin/workflows/ip-whitelist`` endpoint of the :doc:`/api/native-api`. By default, the Dataverse installation honors resume requests from localhost only (``127.0.0.1;::1``), so set-ups that use a single server work with no additional configuration. +Note: these settings are also exposed and manageable via the Settings API. 
+See :ref:`:WorkflowsAdminIpWhitelist`, :ref:`:PrePublishDatasetWorkflowId` and :ref:`:PostPublishDatasetWorkflowId` Available Steps ~~~~~~~~~~~~~~~ @@ -202,16 +206,16 @@ Note - the example step includes two settings required for any archiver, three ( } -ldnannounce -+++++++++++ +coarNotifyRelationshipAnnouncement +++++++++++++++++++++++++++++++++++ -An experimental step that sends a Linked Data Notification (LDN) message to a specific LDN Inbox announcing the publication/availability of a dataset meeting certain criteria. +A step that sends a `COAR Notify Relationship Announcement `_ message, using the `Linked Data Notification (LDN) `_ message standard, +to a specific set of LDN Inboxes announcing a relationship between a newly published/available dataset and an external resource (e.g. one managed by the recipient). The two parameters are -* ``:LDNAnnounceRequiredFields`` - a list of metadata fields that must exist to trigger the message. Currently, the message also includes the values for these fields but future versions may only send the dataset's persistent identifier (making the receiver responsible for making a call-back to get any metadata). -* ``:LDNTarget`` - a JSON object containing an ``inbox`` key whose value is the URL of the target LDN inbox to which messages should be sent, e.g. ``{"id": "https://dashv7-dev.lib.harvard.edu","inbox": "https://dashv7-api-dev.lib.harvard.edu/server/ldn/inbox","type": "Service"}`` ). - -The supported message format is desribed by `our preliminary specification `_. The format is expected to change in the near future to match the standard for relationship announcements being developed as part of `the COAR Notify Project `_. +* ``:COARNotifyRelationshipAnnouncementTriggerFields`` - a list of metadata field types that can trigger messages. Separate messages will be sent for each field (whether of the same type or not). +* ``:COARNotifyRelationshipAnnouncementTargets`` - a JSON Array of JSON objects containing ``id``, ``inbox``, and ``type`` fields as required by the `COAR Notify Relationship Announcement specification `_ . +The ``inbox`` value should be the full URL of the target LDN inbox to which messages should be sent, e.g. ``{"id": "https://dashv7-dev.lib.harvard.edu","inbox": "https://dashv7-api-dev.lib.harvard.edu/server/ldn/inbox","type": ["Service"]}`` ). .. code:: json @@ -219,13 +223,13 @@ The supported message format is desribed by `our preliminary specification `_ and their APIs. You need to provide the same credentials (``username``, ``password``) to Dataverse software to mint and manage DOIs for you. As noted above, you should use one of the more secure options for setting the password. +The `only-update-datacite-when-needed feature` flag is a global option that causes Dataverse to GET the latest metadata from DataCite +for a DOI and compare it with the current metadata in Dataverse and only sending a following POST request if needed. This potentially +substitutes a read for an unnecessary write at DataCite, but would result in extra reads when all metadata in Dataverse is new. +Setting the flag to "true" is recommended when using DataCite file DOIs. + CrossRef-specific Settings ^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -781,7 +807,7 @@ Both Local and Remote Auth The ``authenticationproviderrow`` database table controls which "authentication providers" are available within a Dataverse installation. Out of the box, a single row with an id of "builtin" will be present. 
For each user in a Dataverse installation, the ``authenticateduserlookup`` table will have a value under ``authenticationproviderid`` that matches this id. For example, the default "dataverseAdmin" user will have the value "builtin" under ``authenticationproviderid``. Why is this important? Users are tied to a specific authentication provider but conversion mechanisms are available to switch a user from one authentication provider to the other. As explained in the :doc:`/user/account` section of the User Guide, a graphical workflow is provided for end users to convert from the "builtin" authentication provider to a remote provider. Conversion from a remote authentication provider to the builtin provider can be performed by a sysadmin with access to the "admin" API. See the :doc:`/api/native-api` section of the API Guide for how to list users and authentication providers as JSON. -Adding and enabling a second authentication provider (:ref:`native-api-add-auth-provider` and :ref:`api-toggle-auth-provider`) will result in the Log In page showing additional providers for your users to choose from. By default, the Log In page will show the "builtin" provider, but you can adjust this via the :ref:`conf-default-auth-provider` configuration option. Further customization can be achieved by setting :ref:`conf-allow-signup` to "false", thus preventing users from creating local accounts via the web interface. Please note that local accounts can also be created through the API by enabling the ``builtin-users`` endpoint (:ref:`:BlockedApiEndpoints`) and setting the ``BuiltinUsers.KEY`` database setting (:ref:`BuiltinUsers.KEY`). +Adding and enabling a second authentication provider (:ref:`native-api-add-auth-provider` and :ref:`api-toggle-auth-provider`) will result in the Log In page showing additional providers for your users to choose from. By default, the Log In page will show the "builtin" provider, but you can adjust this via the :ref:`conf-default-auth-provider` configuration option. Further customization can be achieved by setting :ref:`conf-allow-signup` to "false", thus preventing users from creating local accounts via the web interface. Please note that local accounts can also be created through the API by enabling the ``builtin-users`` endpoint (:ref:`:BlockedApiEndpoints`) and setting the ``:BuiltinUsersKey`` database setting (:ref:`:BuiltinUsersKey`). To configure Shibboleth see the :doc:`shibboleth` section and to configure OAuth see the :doc:`oauth2` section. @@ -989,15 +1015,18 @@ File Storage By default, a Dataverse installation stores all data files (files uploaded by end users) on the filesystem at ``/usr/local/payara6/glassfish/domains/domain1/files``. This path can vary based on answers you gave to the installer (see the :ref:`dataverse-installer` section of the Installation Guide) or afterward by reconfiguring the ``dataverse.files.\.directory`` JVM option described below. -A Dataverse installation can alternately store files in a Swift or S3-compatible object store, or on a Globus endpoint, and can now be configured to support multiple stores at once. With a multi-store configuration, the location for new files can be controlled on a per-Dataverse collection basis. - +A Dataverse installation can alternately store files in a Swift or S3-compatible object store, or on a Globus endpoint, and can now be configured to support multiple stores at once. A Dataverse installation may also be configured to reference some files (e.g. 
large and/or sensitive data) stored in a web or Globus accessible trusted remote store. +With a multi-store configuration, the location for new files can be controlled on a per-Dataverse collection or per-dataset basis. +:doc:`/admin/big-data-administration` provides more detail about the pros and cons of different types of storage. A Dataverse installation can be configured to allow out of band upload by setting the ``dataverse.files.\.upload-out-of-band`` JVM option to ``true``. By default, Dataverse supports uploading files via the :ref:`add-file-api`. With S3 stores, a direct upload process can be enabled to allow sending the file directly to the S3 store (without any intermediate copies on the Dataverse server). With the upload-out-of-band option enabled, it is also possible for file upload to be managed manually or via third-party tools, with the :ref:`Adding the Uploaded file to the Dataset ` API call (described in the :doc:`/developers/s3-direct-upload-api` page) used to add metadata and inform Dataverse that a new file has been added to the relevant store. -The following sections describe how to set up various types of stores and how to configure for multiple stores. +The following sections describe how to set up various types of stores and how to configure for multiple stores. See also :ref:`choose-store`. + +.. _multiple-stores: Multi-store Basics ++++++++++++++++++ @@ -1058,6 +1087,8 @@ File stores have one option - the directory where files should be stored. This c Multiple file stores should specify different directories (which would nominally be the reason to use multiple file stores), but one may share the same directory as "\-Ddataverse.files.directory" option - this would result in temp files being stored in the /temp subdirectory within the file store's root directory. +See also :ref:`file-stores`. + Swift Storage +++++++++++++ @@ -1153,6 +1184,8 @@ The Dataverse Software S3 driver supports multi-part upload for large files (ove **Note:** The Dataverse Project Team is most familiar with AWS S3, and can provide support on its usage with the Dataverse Software. Thanks to community contributions, the application's architecture also allows non-AWS S3 providers. The Dataverse Project Team can provide very limited support on these other providers. We recommend reaching out to the wider Dataverse Project Community if you have questions. +See also :ref:`s3-stores`. + First: Set Up Accounts and Access Credentials ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1383,6 +1416,8 @@ You may provide the values for these via any `supported MicroProfile Config API 2. A non-empty ``dataverse.files..profile`` will be ignored when no credentials can be found for this profile name. Current codebase does not make use of "named profiles" as seen for AWS CLI besides credentials. +.. _s3-compatible: + Reported Working S3-Compatible Storage ###################################### @@ -1469,29 +1504,33 @@ In addition to having the type "remote" and requiring a label, Trusted Remote St These and other available options are described in the table below. Trusted remote stores can range from being a static trusted website to a sophisticated service managing access requests and logging activity -and/or managing access to a secure enclave. See :doc:`/developers/big-data-support` for additional information on how to use a trusted remote store. For specific remote stores, consult their documentation when configuring the remote store in your Dataverse installation. 
+and/or managing access to a secure enclave. See :doc:`/admin/big-data-administration` (specifically :ref:`remote-stores`) and :doc:`/developers/big-data-support` for additional information on how to use a trusted remote store. For specific remote stores, consult their documentation when configuring the remote store in your Dataverse installation. -Note that in the current implementation, activites where Dataverse needs access to data bytes, e.g. to create thumbnails or validate hash values at publication will fail if a remote store does not allow Dataverse access. Implementers of such trusted remote stores should consider using Dataverse's settings to disable ingest, validation of files at publication, etc. as needed. +Note that in the current implementation, activities where Dataverse needs access to data bytes, e.g. to create thumbnails or validate hash values at publication will fail if a remote store does not allow Dataverse access. Implementers of such trusted remote stores should consider using Dataverse's settings to disable ingest, validation of files at publication, etc. as needed. Once you have configured a trusted remote store, you can point your users to the :ref:`add-remote-file-api` section of the API Guide. .. table:: :align: left - =========================================== ================== ========================================================================== =================== - JVM Option Value Description Default value - =========================================== ================== ========================================================================== =================== - dataverse.files..type ``remote`` **Required** to mark this storage as remote. (none) - dataverse.files..label **Required** label to be shown in the UI for this storage. (none) - dataverse.files..base-url **Required** All files must have URLs of the form /* . (none) - dataverse.files..base-store **Required** The id of a base store (of type file, s3, or swift). (the default store) - dataverse.files..download-redirect ``true``/``false`` Enable direct download (should usually be true). ``false`` - dataverse.files..secret-key A key used to sign download requests sent to the remote store. Optional. (none) - dataverse.files..url-expiration-minutes If direct downloads and using signing: time until links expire. Optional. 60 - dataverse.files..remote-store-name A short name used in the UI to indicate where a file is located. Optional. (none) - dataverse.files..remote-store-url A url to an info page about the remote store used in the UI. Optional. (none) + ======================================================= ================== ========================================================================== =================== + JVM Option Value Description Default value + ======================================================= ================== ========================================================================== =================== + dataverse.files..type ``remote`` **Required** to mark this storage as remote. (none) + dataverse.files..label **Required** label to be shown in the UI for this storage. (none) + dataverse.files..base-url **Required** All files must have URLs of the form /* . (none) + dataverse.files..base-store **Required** The id of a base store (of type file, s3, or swift). 
(the default store) + dataverse.files..upload-out-of-band ``true`` **Required to be true** Dataverse does not manage file placement ``false`` + dataverse.files..download-redirect ``true``/``false`` Enable direct download (should usually be true). ``false`` + dataverse.files..secret-key A key used to sign download requests sent to the remote store. Optional. (none) + dataverse.files..public ``true``/``false`` True if the remote store does not enforce Dataverse access controls ``false`` + dataverse.files..ingestsizelimit Maximum size of files that should be ingested (none) + dataverse.files..url-expiration-minutes If direct downloads and using signing: time until links expire. Optional. 60 + dataverse.files..remote-store-name A short name used in the UI to indicate where a file is located. Optional. (none) + dataverse.files..remote-store-url A URL to an info page about the remote store used in the UI. Optional. (none) + dataverse.files..files-not-accessible-by-dataverse ``true``/``false`` True if the file is at the URL provided, false if that is a landing page ``false`` - =========================================== ================== ========================================================================== =================== + ======================================================= ================== ========================================================================== =================== .. _globus-storage: @@ -1531,6 +1570,7 @@ Once you have configured a globus store, or configured an S3 store for Globus ac for a managed store) - using a microprofile alias is recommended (none) dataverse.files..reference-endpoints-with-basepaths A comma separated list of *remote* trusted Globus endpoint id/s (none) dataverse.files..files-not-accessible-by-dataverse ``true``/``false`` Should be false for S3 Connector-based *managed* stores, true for others ``false`` + dataverse.files..public ``true``/``false`` True can be used to disable users ability restrict/embargo files ``false`` ======================================================= ================== ========================================================================== =================== @@ -2092,35 +2132,56 @@ JSON files for `Creative Commons licenses `_ are provided below. + +- :download:`licenseODbL-1.0.json <../../../../scripts/api/data/licenses/licenseODbL-1.0.json>` +- :download:`licenseODC-By-1.0.json <../../../../scripts/api/data/licenses/licenseODC-By-1.0.json>` +- :download:`licensePDDL-1.0.json <../../../../scripts/api/data/licenses/licensePDDL-1.0.json>` + Adding Software Licenses ^^^^^^^^^^^^^^^^^^^^^^^^ JSON files for software licenses are provided below. -- :download:`licenseMIT.json <../../../../scripts/api/data/licenses/licenseMIT.json>` - :download:`licenseApache-2.0.json <../../../../scripts/api/data/licenses/licenseApache-2.0.json>` +- :download:`licenseMIT.json <../../../../scripts/api/data/licenses/licenseMIT.json>` +- :download:`licenseEUPL-1.2.json <../../../../scripts/api/data/licenses/licenseEUPL-1.2.json>` Adding Country-Specific Licenses ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - :download:`licenseEtalab-2.0.json <../../../../scripts/api/data/licenses/licenseEtalab-2.0.json>` used in France (Etalab Open License 2.0, CC-BY 2.0 compliant). 
+- :download:`licenseOGL-UK-3.0.json <../../../../scripts/api/data/licenses/licenseOGL-UK-3.0.json>` Contributing to the Collection of Standard Licenses Above ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -If you do not find the license JSON you need above, you are encouraged to contribute it to this documentation. Following the Dataverse 6.2 release, we have standardized on the following procedure: +If you do not find the license JSON you need above, you are encouraged to contribute it to this documentation. Following the Dataverse 6.9 release, we have standardized on the following procedure: -- Look for the license at https://spdx.org/licenses/ -- ``cd scripts/api/data/licenses`` +- Look for the license at https://spdx.org/licenses/ and https://github.com/datacite/bracco/blob/main/app/spdx.js. +- ``cd scripts/api/data/licenses``. - Copy an existing license as a starting point. - Name your file using the SPDX identifier. For example, if the identifier is ``Apache-2.0``, you should name your file ``licenseApache-2.0.json``. - For the ``name`` field, use the "short identifier" from the SPDX landing page (e.g. ``Apache-2.0``). -- For the ``description`` field, use the "full name" from the SPDX landing page (e.g. ``Apache License 2.0``). -- For the ``uri`` field, we encourage you to use the same resource that DataCite uses, which is often the same as the first "Other web pages for this license" on the SPDX page for the license. When these differ, or there are other concerns about the URI DataCite uses, please reach out to the community to see if a consensus can be reached. +- For the ``shortDescription`` field, use the "full name" from the SPDX landing page (e.g. ``Apache License 2.0``) followed by a period (full-stop) (e.g. ``Apache License 2.0.``). +- For the ``uri`` field, use the same resource that DataCite uses, which is often the same as the first "Other web pages for this license" on the SPDX page for the license. Look at the ``seeAlso`` array for the license at https://github.com/datacite/bracco/blob/main/app/spdx.js to be sure. When these differ, or there are other concerns about the URI DataCite uses, please reach out to the community to see if a consensus can be reached. See :ref:`support`. - For the ``active`` field, put ``true``. - For the ``sortOrder`` field, put the next sequential number after checking previous files with ``grep sortOrder scripts/api/data/licenses/*``. +- For the ``rightsIdentifier`` field, use the "short identifier" from the SPDX landing page (e.g. ``Apache-2.0``). +- For the ``rightsIdentifierScheme`` field, use "SPDX". +- For the ``schemeUri`` field, use "https://spdx.org/licenses/". +- For the ``languageCode`` field, use "en". +- For all of the fields above, resist the urge to change the spelling of words like license/licence, center/centre, etc. SPDX is the upstream authority, and they have the following `varietal word spelling policy `_: "The words in each line of the text file available at https://spdx.org/licenses/equivalentwords.txt are considered equivalent and interchangeable." + +In the past, licenses have been added that do not adhere perfectly with the procedure above. Here are known inconsistencies: -Note that prior to Dataverse 6.2, various license above have been added that do not adhere perfectly with this procedure. For example, the ``name`` for the CC0 license is ``CC0 1.0`` (no dash) rather than ``CC0-1.0`` (with a dash). We are keeping the existing names for backward compatibility. 
For more on standarizing license configuration, see https://github.com/IQSS/dataverse/issues/8512 +- The ``name`` for the CC licenses don't have a dash as their SPDX short identifiers do (e.g. CC-BY-4.0, CC-BY-NC-4.0, CC-BY-NC-ND-4.0, CC-BY-NC-SA-4.0, CC-BY-ND-4.0, CC-BY-SA-4.0, CC0-1.0). For example, the ``name`` for the CC0 license is ``CC0 1.0`` (no dash) rather than ``CC0-1.0`` (with a dash). We are keeping the existing names without dashes for backward compatibility. +- The ``uri`` for Creative Commons licenses comes from the Creative Commons website rather than SPDX or DataCite. As with ``name``, we are keeping ``uri`` the as-is for these licenses for backward compatibility. For more on our attempts to standardize license configuration, see https://github.com/IQSS/dataverse/issues/8512 and https://github.com/IQSS/dataverse/pull/11522. +- The ``uri`` for Etalab is https://spdx.org/licenses/etalab-2.0 rather than a link listed in SPDX or DataCite. +- The ``shortDescription`` doesn't have a trailing period for Apache-2.0, Etalab, and MIT. Adding Custom Licenses ^^^^^^^^^^^^^^^^^^^^^^ @@ -2356,6 +2417,9 @@ The workflow id returned in this call (or available by doing a GET of /api/admin Once these steps are taken, new publication requests will automatically trigger submission of an archival copy to the specified archiver, Chronopolis' DuraCloud component in this example. For Chronopolis, as when using the API, it is currently the admin's responsibility to snap-shot the DuraCloud space and monitor the result. Failure of the workflow, (e.g. if DuraCloud is unavailable, the configuration is wrong, or the space for this dataset already exists due to a prior publication action or use of the API), will create a failure message but will not affect publication itself. +Note: setting the default workflow is also available via the Settings API. +See :ref:`:WorkflowsAdminIpWhitelist`, :ref:`:PrePublishDatasetWorkflowId` and :ref:`:PostPublishDatasetWorkflowId` + .. _bag-info.txt: Configuring bag-info.txt @@ -2476,6 +2540,28 @@ Setting Up Integrations Before going live, you might want to consider setting up integrations to make it easier for your users to deposit or explore data. See the :doc:`/admin/integrations` section of the Admin Guide for details. +.. _comma-separated-config-values: + +Comma-Separated Configuration Values +------------------------------------ + +Many configuration options (both MicroProfile/JVM settings and database settings) accept comma-separated lists. For all such settings, Dataverse applies consistent, lightweight parsing: + +- Whitespace immediately around commas is ignored (e.g., ``GET, POST`` is equivalent to ``GET,POST``). +- Tokens are otherwise preserved exactly as typed. There is no quote parsing and no escape processing. +- Embedded commas within a token are not supported. + +Examples include (but are not limited to): + +- :ref:`dataverse.cors.origin ` +- :ref:`dataverse.cors.methods ` +- :ref:`dataverse.cors.headers.allow ` +- :ref:`dataverse.cors.headers.expose ` +- :ref:`:UploadMethods` + +This behavior is implemented centrally and applies across all Dataverse settings that accept comma-separated values. + + .. _jvm-options: JVM Options @@ -2757,6 +2843,8 @@ when using it to configure your core name! Can also be set via *MicroProfile Config API* sources, e.g. the environment variable ``DATAVERSE_SOLR_PATH``. +.. 
_dataverse.solr.min-files-to-use-proxy: + dataverse.solr.min-files-to-use-proxy +++++++++++++++++++++++++++++++++++++ @@ -2768,6 +2856,8 @@ A recommended value would be ~1000 but the optimal value may vary depending on d Can also be set via *MicroProfile Config API* sources, e.g. the environment variable ``DATAVERSE_SOLR_MIN_FILES_TO_USE_PROXY``. +.. _dataverse.solr.concurrency.max-async-indexes: + dataverse.solr.concurrency.max-async-indexes ++++++++++++++++++++++++++++++++++++++++++++ @@ -3189,7 +3279,7 @@ dataverse.person-or-org.org-phrase-array Please note that this setting is experimental. The Schema.org metadata and OpenAIRE exports and the Schema.org metadata included in DatasetPages try to infer whether each entry in the various fields (e.g. Author, Contributor) is a Person or Organization. -If you have examples where an orgization name is being inferred to belong to a person, you can use this setting to force it to be recognized as an organization. +If you have examples where an organization name is being inferred to belong to a person, you can use this setting to force it to be recognized as an organization. The value is expected to be a comma-separated list of strings. Any name that contains one of the strings is assumed to be an organization. For example, "Project" is a word that is not otherwise associated with being an organization. Can also be set via *MicroProfile Config API* sources, e.g. the environment variable ``DATAVERSE_PERSON_OR_ORG_ORG_PHRASE_ARRAY``. @@ -3633,8 +3723,8 @@ The default value when not set is "chicago-author-date, ieee". .. _localcontexts: -localcontexts.url -+++++++++++++++++ +dataverse.localcontexts.url ++++++++++++++++++++++++++++ .. note:: For more information about LocalContexts integration, see :doc:`/installation/localcontexts`. @@ -3646,8 +3736,8 @@ The URL for the Local Contexts Hub API. Can also be set via *MicroProfile Config API* sources, e.g. the environment variable ``DATAVERSE_LOCALCONTEXTS_URL``. -localcontexts.api-key -+++++++++++++++++++++ +dataverse.localcontexts.api-key ++++++++++++++++++++++++++++++++ The API key for accessing the Local Contexts Hub. @@ -3669,7 +3759,7 @@ Experimental. See :doc:`/developers/search-services`. .. _dataverse.cors: CORS Settings -------------- ++++++++++++++ The following settings control Cross-Origin Resource Sharing (CORS) for your Dataverse installation. @@ -3678,10 +3768,9 @@ The following settings control Cross-Origin Resource Sharing (CORS) for your Dat dataverse.cors.origin +++++++++++++++++++++ -Allowed origins for CORS requests. The default with no value set is to not include CORS headers. However, if the deprecated :AllowCors setting is explicitly set to true the default is "\*" (all origins). -When the :AllowsCors setting is not used, you must set this setting to "\*" or a list of origins to enable CORS headers. +Allowed origins for CORS requests. If this setting is not defined, CORS headers are not added. Set to ``*`` to allow all origins (note that browsers will not allow credentialed requests with ``*``) or provide a comma-separated list of explicit origins. -Multiple origins can be specified as a comma-separated list. +Multiple origins can be specified as a comma-separated list (whitespace is ignored): Example: @@ -3689,6 +3778,11 @@ Example: Can also be set via any `supported MicroProfile Config API source`_, e.g. the environment variable ``DATAVERSE_CORS_ORIGIN``. 
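As an illustrative sketch (the origins below are placeholders), the value can be supplied via the environment variable and the resulting headers inspected with a request that carries an ``Origin`` header:

.. code-block:: bash

    # Placeholder origins; set before starting Payara (or in your container environment).
    export DATAVERSE_CORS_ORIGIN="https://app.example.edu, https://viz.example.org"

    # After a restart, check which CORS headers come back for a matching Origin:
    curl -s -D - -o /dev/null -H "Origin: https://app.example.edu" \
      "http://localhost:8080/api/info/version" | grep -i -E 'access-control|vary'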
+Behavior: + +* When a list of origins is configured, Dataverse echoes the single matching request ``Origin`` value in ``Access-Control-Allow-Origin`` and adds ``Vary: Origin`` to support correct proxy/CDN caching. +* When ``*`` is configured, ``Access-Control-Allow-Origin: *`` is sent and ``Vary`` is not modified. + .. _dataverse.cors.methods: dataverse.cors.methods @@ -3731,6 +3825,39 @@ Example: Can also be set via any `supported MicroProfile Config API source`_, e.g. the environment variable ``DATAVERSE_CORS_HEADERS_EXPOSE``. + +.. _dataverse.api.mdc.min-delay-ms: + +dataverse.api.mdc.min-delay-ms +++++++++++++++++++++++++++++++ + +Minimum delay in milliseconds between Make Data Count (MDC) API requests from the /api/admin/makeDataCount/{id}/updateCitationsForDataset API. +This setting helps prevent overloading the MDC service by enforcing a minimum time interval between consecutive requests. +If a request arrives before this interval has elapsed since the previous request, it will be rate-limited. + +Default: ``0`` (no delay enforced) + +Example: ``dataverse.api.mdc.min-delay-ms=100`` (enforces a minimum 100ms delay between MDC API requests) + +Can also be set via any `supported MicroProfile Config API source`_, e.g. the environment variable ``DATAVERSE_API_MDC_MIN_DELAY_MS``. + +.. dataverse.ldn + +Linked Data Notifications (LDN) Allowed Hosts ++++++++++++++++++++++++++++++++++++++++++++++ + +Dataverse supports receiving LDN notifications via the /api/inbox endpoint. The ``dataverse.ldn.allowed-hosts`` setting allows you to specify the list of host IP addresses from which LDN notifications can be received, or ``*`` to receive messages from anywhere. + +Example: ``dataverse.ldn.allowed-hosts=*`` + +COAR Notify Relationship Announcement Notify Superusers Only +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +When Dataverse receives an LDN message conforming to the COAR Notify Relationship Announcement format and the message is about a dataset hosted in the installation, Dataverse will send a notification to users who have permission to publish the dataset. +Using this option, these notifications can instead be restricted to superusers who can publish the dataset. + +Example: ``dataverse.coar-notify.relationship-announcement.notify-superusers-only=true`` + .. _feature-flags: Feature Flags @@ -3805,6 +3932,12 @@ please find all known feature flags below. Any of these flags can be activated u * - enable-pid-failure-log - Turns on creation of a monthly log file (logs/PIDFailures_.log) showing failed requests for dataset/file PIDs. Can be used directly or with scripts at https://github.com/gdcc/dataverse-recipes/python/pid_reports to alert admins. - ``Off`` + * - role-assignment-history + - Turns on tracking/display of role assignments and revocations for collections, datasets, and files + - ``Off`` + * - only-update-datacite-when-needed + - Only contact DataCite to update a DOI after checking to see if DataCite has outdated information (for efficiency, lighter load on DataCite, especially when using file DOIs). + - ``Off`` **Note:** Feature flags can be set via any `supported MicroProfile Config API source`_, e.g. the environment variable ``DATAVERSE_FEATURE_XXX`` (e.g. ``DATAVERSE_FEATURE_API_SESSION_AUTH=1``). These environment variables can be set in your shell before starting Payara. If you are using :doc:`Docker for development `, you can set them in the `docker compose `_ file.
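For example, the two flags added above could be enabled with environment variables like these (an illustrative sketch; any flag follows the same ``DATAVERSE_FEATURE_XXX`` pattern, upper-cased with dashes replaced by underscores):

.. code-block:: bash

    # Set before starting Payara (or in your container environment).
    export DATAVERSE_FEATURE_ROLE_ASSIGNMENT_HISTORY=1
    export DATAVERSE_FEATURE_ONLY_UPDATE_DATACITE_WHEN_NEEDED=1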
@@ -3873,11 +4006,14 @@ You might also create your own profiles and use these, please refer to the upstr Database Settings ----------------- -These settings are stored in the ``setting`` database table but can be read and modified via the "admin" endpoint of the :doc:`/api/native-api` for easy scripting. +These settings are stored in the ``setting`` database table but we recommend using the Settings Admin API (:ref:`admin-api-db-settings`) to view and modify them, as shown below. +If changed in the database directly, you need to reload the application to make the ORM pickup the changes. -The most commonly used configuration options are listed first. +In short: -The pattern you will observe in curl examples below is that an HTTP ``PUT`` is used to add or modify a setting. If you perform an HTTP ``GET`` (the default when using curl), the output will contain the value of the setting, if it has been set. You can also do a ``GET`` of all settings with ``curl http://localhost:8080/api/admin/settings`` which you may want to pretty-print by piping the output through a tool such as jq by appending ``| jq .``. If you want to remove a setting, use an HTTP ``DELETE`` such as ``curl -X DELETE http://localhost:8080/api/admin/settings/:GuidesBaseUrl`` . +- HTTP ``GET`` is used to show settings. +- HTTP ``PUT`` is used to add or modify settings. +- HTTP ``DELETE`` is used to delete settings. .. _:BlockedApiPolicy: @@ -3933,14 +4069,16 @@ Now that ``:BlockedApiKey`` has been enabled, blocked APIs can be accessed using ``curl https://demo.dataverse.org/api/admin/settings?unblock-key=theKeyYouChose`` -.. _BuiltinUsers.KEY: +.. _:BuiltinUsersKey: -BuiltinUsers.KEY +:BuiltinUsersKey ++++++++++++++++ The key required to create users via API as documented at :doc:`/api/native-api`. Unlike other database settings, this one doesn't start with a colon. -``curl -X PUT -d builtInS3kretKey http://localhost:8080/api/admin/settings/BuiltinUsers.KEY`` +``curl -X PUT -d builtInS3kretKey http://localhost:8080/api/admin/settings/:BuiltinUsersKey`` + +Note: this key used to be named ``BuiltinUsers.KEY`` until Dataverse 6.8. :SearchApiRequiresToken +++++++++++++++++++++++ @@ -4352,6 +4490,8 @@ Notes: - For larger file upload sizes, you may need to configure your reverse proxy timeout. If using apache2 (httpd) with Shibboleth, add a timeout to the ProxyPass defined in etc/httpd/conf.d/ssl.conf (which is described in the :doc:`/installation/shibboleth` setup). +.. _:MultipleUploadFilesLimit: + :MultipleUploadFilesLimit +++++++++++++++++++++++++ @@ -4370,33 +4510,71 @@ For performance reasons, your Dataverse installation will only allow creation of In the UI, users trying to download a zip file larger than the Dataverse installation's :ZipDownloadLimit will receive messaging that the zip file is too large, and the user will be presented with alternate access options. +.. _:TabularIngestSizeLimit: + :TabularIngestSizeLimit +++++++++++++++++++++++ -Threshold in bytes for limiting whether or not "ingest" it attempted for tabular files (which can be resource intensive). For example, with the below in place, files greater than 2 GB in size will not go through the ingest process: +Threshold in bytes for limiting whether or not "ingest" is attempted for an uploaded tabular file (which can be resource intensive). +For more on the ingest feature, see :doc:`/user/tabulardataingest/index` in the User Guide. + +There are two ways to specify ingest size limits. 
You can set a global limit for all file types or you can use a JSON file for more granularity. We'll cover the global limit first. + +With the following value in place (again, expressed in bytes), files greater than 2 GB in size will not go through the ingest process: ``curl -X PUT -d 2000000000 http://localhost:8080/api/admin/settings/:TabularIngestSizeLimit`` -(You can set this value to 0 to prevent files from being ingested at all.) +You can set this value to ``0`` to prevent files from being ingested at all. + +Out of the box, the ``:TabularIngestSizeLimit`` setting is absent, which results in ingest being attempted no matter how large the file is. You can specify this "no size limit" default explicitly with the value ``-1``. -You can override this global setting on a per-format basis for the following formats: +Using a JSON-based setting, you can set a global default and per-format limits for the following formats: +- CSV - DTA - POR -- SAV - Rdata -- CSV -- XLSX (in lower-case) +- SAV +- XLSX -For example : +(In previous releases of Dataverse, a colon-separated form was used to specify per-format limits, such as ``:TabularIngestSizeLimit:Rdata``, but this is no longer supported. Now JSON is used.) -* if you want your Dataverse installation to not attempt to ingest Rdata files larger than 1 MB, use this setting: +The expected JSON is an object with key/value pairs like the following. +Format names are case-insensitive, and all fields are optional (an empty JSON object means no restrictions). +The size limits must be whole numbers, either presented as strings with double quotes around them (e.g. ``"10"``) or numeric values (e.g. ``10`` or ``10.0``). +Note that decimal numbers like ``10.5`` are invalid. +Any invalid setting will temporarily disable tabular ingest until corrected. -``curl -X PUT -d 1000000 http://localhost:8080/api/admin/settings/:TabularIngestSizeLimit:Rdata`` +.. code:: json -* if you want your Dataverse installation to not attempt to ingest XLSX files at all, use this setting: + { + "default": "-1", + "csv": "0", + "dta": "10", + "por": "100" + } + +Whatever JSON you send will overwrite existing values. If you have any existing ``:TabularIngestSizeLimit`` settings, you can use the following command to see them in the expected input format above (and then add the new settings you want): + +``curl http://localhost:8080/api/admin/settings/:TabularIngestSizeLimit | jq -r '.data.message'`` + +The ``default`` key is optional and can be used to give limits to formats that are not specified in the JSON. If you omit the ``default`` key or set it to ``"-1"``, no limits are applied to formats not specified in the JSON. If you set it to ``"0"``, ingest will be disabled (but you can override this per-format). + +Add a format name (``csv``, ``dta``, etc., as listed above) to change the limit for that particular format. + +Examples: -``curl -X PUT -d 0 http://localhost:8080/api/admin/settings/:TabularIngestSizeLimit:xlsx`` +1. If you want your Dataverse installation to not attempt to ingest Rdata files larger than 1 MB but otherwise be unlimited: + + ``curl -X PUT -d '{"Rdata":"1000000"}' http://localhost:8080/api/admin/settings/:TabularIngestSizeLimit`` +2. If you want your Dataverse installation to not attempt to ingest XLSX files at all and apply a global limit of 512 MiB, use this setting: + + ``curl -X PUT -d '{"default":"536870912", "XLSX":"0"}' http://localhost:8080/api/admin/settings/:TabularIngestSizeLimit`` +3.
If you want your Dataverse installation to not attempt to ingest files at all except for CSV files that are 256 MiB or smaller, use this setting: + + ``curl -X PUT -d '{"default":"0", "CSV":"268435456"}' http://localhost:8080/api/admin/settings/:TabularIngestSizeLimit`` + +.. _:ZipUploadFilesLimit: :ZipUploadFilesLimit ++++++++++++++++++++ @@ -4416,6 +4594,8 @@ By default your Dataverse installation will attempt to connect to Solr on port 8 **Note:** instead of using a database setting, you could alternatively use JVM settings like :ref:`dataverse.solr.host`. +.. _:SolrFullTextIndexing: + :SolrFullTextIndexing +++++++++++++++++++++ @@ -4423,6 +4603,8 @@ Whether or not to index the content of files such as PDFs. The default is false. ``curl -X PUT -d true http://localhost:8080/api/admin/settings/:SolrFullTextIndexing`` +.. _:SolrMaxFileSizeForFullTextIndexing: + :SolrMaxFileSizeForFullTextIndexing +++++++++++++++++++++++++++++++++++ @@ -4444,12 +4626,15 @@ To enable the setting:: curl -X PUT -d true "http://localhost:8080/api/admin/settings/:DisableSolrFacets" +.. _:DisableSolrFacetsForGuestUsers: :DisableSolrFacetsForGuestUsers +++++++++++++++++++++++++++++++ Similar to the above, but will disable the facets for Guest (unauthenticated) users only. +.. _:DisableSolrFacetsWithoutJsession: + :DisableSolrFacetsWithoutJsession +++++++++++++++++++++++++++++++++ @@ -4943,20 +5128,6 @@ This can be helpful in situations where multiple organizations are sharing one D or ``curl -X PUT -d '*' http://localhost:8080/api/admin/settings/:InheritParentRoleAssignments`` -:AllowCors (Deprecated) -+++++++++++++++++++++++ - -.. note:: - This setting is deprecated. Please use the JVM settings above instead. - This legacy setting will only be used if the newer JVM settings are not set. - -Enable or disable support for Cross-Origin Resource Sharing (CORS) by setting ``:AllowCors`` to ``true`` or ``false``. - -``curl -X PUT -d true http://localhost:8080/api/admin/settings/:AllowCors`` - -.. note:: - New values for this setting will only be used after a server restart. - :ChronologicalDateFacets ++++++++++++++++++++++++ @@ -4966,6 +5137,8 @@ If you don’t want date facets to be sorted chronologically, set: ``curl -X PUT -d 'false' http://localhost:8080/api/admin/settings/:ChronologicalDateFacets`` +.. _:CustomZipDownloadServiceUrl: + :CustomZipDownloadServiceUrl ++++++++++++++++++++++++++++ @@ -4989,6 +5162,43 @@ Number of errors to display to the user when creating DataFiles from a file uplo ``curl -X PUT -d '1' http://localhost:8080/api/admin/settings/:CreateDataFilesMaxErrorsToDisplay`` +.. _:WorkflowsAdminIpWhitelist: + +:WorkflowsAdminIpWhitelist +++++++++++++++++++++++++++ + +A semicolon-separated list of IP addresses from which workflow resume requests are honored. +By default, the Dataverse installation honors resume requests from localhost only (``127.0.0.1;::1``). +This setting allows for preventing unauthorized resuming of workflows. + +``curl -X PUT -d '127.0.0.1;::1;192.168.0.1' http://localhost:8080/api/admin/settings/:WorkflowsAdminIpWhitelist`` + +See :ref:`Workflow Admin section ` for more details and context. + +.. _:PrePublishDatasetWorkflowId: + +:PrePublishDatasetWorkflowId +++++++++++++++++++++++++++++ + +The identifier of the workflow to be executed prior to dataset publication. +This pre-publish workflow is useful for preparing a dataset for public access (e.g., moving files, checking metadata) or starting an approval process. 
+ +``curl -X PUT -d '1' http://localhost:8080/api/admin/settings/:PrePublishDatasetWorkflowId`` + +See :ref:`Workflow Admin section ` for more details and context. + +.. _:PostPublishDatasetWorkflowId: + +:PostPublishDatasetWorkflowId ++++++++++++++++++++++++++++++ + +The identifier of the workflow to be executed after a dataset has been successfully published. +This post-publish workflow is useful for actions such as sending notifications about the newly published dataset or archiving. + +``curl -X PUT -d '2' http://localhost:8080/api/admin/settings/:PostPublishDatasetWorkflowId`` + +See :ref:`Workflow Admin section ` for more details and context. + .. _:BagItHandlerEnabled: :BagItHandlerEnabled @@ -5097,6 +5307,8 @@ A suggested minimum includes author, datasetContact, and contributor, but additi ``curl -X PUT -d 'author, datasetContact, contributor, depositor, grantNumber, publication' http://localhost:8080/api/admin/settings/:AnonymizedFieldTypeNames`` +.. _:DatasetChecksumValidationSizeLimit: + :DatasetChecksumValidationSizeLimit +++++++++++++++++++++++++++++++++++ @@ -5112,6 +5324,8 @@ Refer to "Physical Files Validation in a Dataset" API :ref:`dataset-files-valida Also refer to the "Datafile Integrity" API :ref:`datafile-integrity` +.. _:DataFileChecksumValidationSizeLimit: + :DataFileChecksumValidationSizeLimit ++++++++++++++++++++++++++++++++++++ @@ -5373,6 +5587,8 @@ To use the current GDCC version directly: ``curl -X PUT -d 'https://gdcc.github.io/dvwebloader/src/dvwebloader.html' http://localhost:8080/api/admin/settings/:WebloaderUrl`` +.. _:CategoryOrder: + :CategoryOrder ++++++++++++++ @@ -5382,6 +5598,8 @@ The default is category ordering disabled. ``curl -X PUT -d 'Documentation,Data,Code' http://localhost:8080/api/admin/settings/:CategoryOrder`` +.. _:OrderByFolder: + :OrderByFolder ++++++++++++++ diff --git a/doc/sphinx-guides/source/installation/intro.rst b/doc/sphinx-guides/source/installation/intro.rst index 1c239863e98..e13c22824b7 100644 --- a/doc/sphinx-guides/source/installation/intro.rst +++ b/doc/sphinx-guides/source/installation/intro.rst @@ -17,16 +17,18 @@ Jump ahead to :doc:`config` or :doc:`upgrading` for an existing Dataverse instal Intended Audience ----------------- -This guide is intended primarily for sysadmins who are installing, configuring, and upgrading a Dataverse installation. +This guide is intended primarily for sysadmins who are installing, configuring, and upgrading a Dataverse installation. This guide was written with non-Docker installation in mind but if you're interested in Docker, see the :doc:`/container/index`. Sysadmins are expected to be comfortable using standard Linux commands, issuing ``curl`` commands, and running SQL scripts. Related Guides -------------- +See the :doc:`/container/index` if you want to run Dataverse in Docker. + Many "admin" functions can be performed by Dataverse installation users themselves (non-superusers) as documented in the :doc:`/user/index` and that guide is a good introduction to the features of the Dataverse Software from an end user perspective. -If you are a sysadmin who likes to code, you may find the :doc:`/api/index` useful, and you may want to consider improving the installation script or hacking on the community-lead configuration management options mentioned in the :doc:`prep` section. If you **really** like to code and want to help with the Dataverse Software code, please check out the :doc:`/developers/index`! 
+If you are a sysadmin who likes to code, you may find the :doc:`/api/index` useful, and you may want to consider improving the installation script or hacking on the community-lead configuration management options mentioned in the :doc:`prep` section. If you **really** like to code and want to help with the Dataverse code or documentation, please check out the :doc:`/contributor/index` and the :doc:`/developers/index`! .. _support: diff --git a/doc/sphinx-guides/source/installation/localcontexts.rst b/doc/sphinx-guides/source/installation/localcontexts.rst index 174b2d0ac94..2bafc2524d9 100644 --- a/doc/sphinx-guides/source/installation/localcontexts.rst +++ b/doc/sphinx-guides/source/installation/localcontexts.rst @@ -18,18 +18,16 @@ Configuration There are several steps to LocalContexts integration. -First, you need to configure the LOCALCONTEXTS_URL and LOCALCONTEXTS_API_KEY as described in the :ref:`localcontexts` section of the Configuration Guide. -API Keys are available to Local Contexts Integration Partners - see https://localcontexts.org/hub-agreements/integration-partners/ for details. - -Next, you should add the Local Contexts metadatablock and configure the associated external vocabulary script. -The metadatablock contains one field allowing Dataverse to store the URL of an associated Local Contexts Hub project. -The external vocabulary script interacts with the Local Contexts Hub (via the Dataverse server) to display the Labels and Notices associated with the proect and provide a link to it. -The script also supports adding/removing such a link from the dataset's metadata. Note that only a project that references the dataset in its `publication_doi` field can be linked to a dataset. -See https://github.com/gdcc/dataverse-external-vocab-support/blob/main/packages/local_contexts/README.md for details on these steps. - -Lastly, if you wish the Local Contexts information to be shown in the summary section of the dataset page, as shown in the image above, you should add `LCProjectUrl` to list of custom summary fields via use of the :ref:`:CustomDatasetSummaryFields` setting. - -Optionally, one could also set the dataverse.feature.add-local-contexts-permission-check FeatureFlag to true. This assures that only users editing datasets can use the LocalContexts search functionality. -However, as this currently would also require setting the dataverse.feature.api-session-auth, the security implications of which haven't been fully explored, it is not recommended unless problematic use is seen. -(When API access via OpenIdConnect is available, use of api-session-auth would not be required.) +- A Local Contexts Hub Institutional or Integration Partner account is required. See https://localcontexts.org/hub-agreements for more information and the associated costs. + (Institutions may wish to connect their Institutional accounts with `The Dataverse Project Integration Partner `_ rather than having their own Integration Partner account.) + (Free accounts for testing can be created at https://sandbox.localcontextshub.org/.) +- Create an API key in your Local Contexts account. +- Configure the DATAVERSE_LOCALCONTEXTS_URL and DATAVERSE_LOCALCONTEXTS_API_KEY as described in the :ref:`localcontexts` section of the Configuration Guide. +- Add the Local Contexts metadatablock and configure the associated external vocabulary script. Both are available, along with installation instructions, in the `Dataverse External Vocabulary GitHub Repository `_. 
+ The metadatablock contains one field allowing Dataverse to store the URL of an associated Local Contexts Hub project. Be sure to update the Solr schema after installing the metadatablock (see :ref:`update-solr-schema`). + The external vocabulary script interacts with the Local Contexts Hub (via the Dataverse server) to display the Labels and Notices associated with the project and provide a link to it. + The script also supports adding/removing such a link from the dataset's metadata. Note that only a project that references the dataset's PID in its `Optional Project Information` field can be linked to a dataset. +- Lastly, to show Local Contexts information in the summary section of the dataset page, as shown in the image above, you should add `LCProjectUrl` to the list of custom summary fields via use of the :ref:`:CustomDatasetSummaryFields` setting. +- Optionally, one can also set the dataverse.feature.add-local-contexts-permission-check FeatureFlag to true. This ensures that only users editing datasets can use the LocalContexts search functionality (e.g. via API). + This is not recommended unless problematic use is seen. diff --git a/doc/sphinx-guides/source/installation/prep.rst b/doc/sphinx-guides/source/installation/prep.rst index abb4349d3ad..9f8efe84c01 100644 --- a/doc/sphinx-guides/source/installation/prep.rst +++ b/doc/sphinx-guides/source/installation/prep.rst @@ -19,6 +19,13 @@ Standard Installation Installing the Dataverse Software involves some system configuration followed by executing an installation script that will guide you through the installation process as described in :doc:`installation-main`, but reading about the :ref:`architecture` of the Dataverse Software is recommended first. +.. _docker-installation: + +Docker Installation ++++++++++++++++++++ + +See the :doc:`/container/index`, especially :doc:`/container/running/index` and :doc:`/container/running/production`. + .. _advanced: Advanced Installation @@ -26,13 +33,13 @@ Advanced Installation There are some community-lead projects to use configuration management tools such as Ansible and Puppet to automate the installation and configuration of the Dataverse Software, but support for these solutions is limited to what the Dataverse Community can offer as described in each project's webpage: -- https://github.com/GlobalDataverseCommunityConsortium/dataverse-ansible +- https://github.com/gdcc/dataverse-ansible - https://gitlab.com/lip-computing/dataverse - https://github.com/IQSS/dataverse-puppet (Please note that the "dataverse-ansible" repo is used in a script that allows the Dataverse Software to be installed on Amazon Web Services (AWS) from arbitrary GitHub branches as described in the :doc:`/developers/deployment` section of the Developer Guide.) -The Dataverse Project team is happy to "bless" additional community efforts along these lines (i.e. Docker, Chef, Salt, etc.) by creating a repo under https://github.com/gdcc and managing team access. +The Dataverse Project team is happy to "bless" additional community efforts along these lines (i.e. Chef, Salt, etc.) by creating a repo under https://github.com/gdcc and managing team access. The Dataverse Software permits a fair amount of flexibility in where you choose to install the various components. The diagram below shows a load balancer, multiple proxies and web servers, redundant database servers, and offloading of potentially resource intensive work to a separate server. (Glassfish is shown rather than Payara.)
@@ -110,7 +117,7 @@ Decisions to Make Here are some questions to keep in the back of your mind as you test and move into production: -- How much storage do I need? +- How much storage do I need? What is the scale of data I will need to handle (see :doc:`/admin/big-data-administration`)? - Which features do I want based on :ref:`architecture`? - How do I want my users to log in to the Dataverse installation? With local accounts? With Shibboleth/SAML? With OAuth providers such as ORCID, GitHub, or Google? - Do I want to to run my app server on the standard web ports (80 and 443) or do I want to "front" my app server with a proxy such as Apache or nginx? See "Network Ports" in the :doc:`config` section. diff --git a/doc/sphinx-guides/source/installation/prerequisites.rst b/doc/sphinx-guides/source/installation/prerequisites.rst index a5aacc4701c..70a1c618fe5 100644 --- a/doc/sphinx-guides/source/installation/prerequisites.rst +++ b/doc/sphinx-guides/source/installation/prerequisites.rst @@ -44,7 +44,7 @@ On RHEL/derivative you can make Java 17 the default with the ``alternatives`` co Payara ------ -Payara 6.2025.3 is recommended. Newer versions might work fine. Regular updates are recommended. +Payara 6.2025.10 is recommended. Newer versions might work fine. Regular updates are recommended. Installing Payara ================= @@ -55,8 +55,8 @@ Installing Payara - Download and install Payara (installed in ``/usr/local/payara6`` in the example commands below):: - # wget https://nexus.payara.fish/repository/payara-community/fish/payara/distributions/payara/6.2025.3/payara-6.2025.3.zip - # unzip payara-6.2025.3.zip + # wget https://nexus.payara.fish/repository/payara-community/fish/payara/distributions/payara/6.2025.10/payara-6.2025.10.zip + # unzip payara-6.2025.10.zip # mv payara6 /usr/local If nexus.payara.fish is ever down for maintenance, Payara distributions are also available from https://repo1.maven.org/maven2/fish/payara/distributions/payara/ diff --git a/doc/sphinx-guides/source/qa/test-automation.md b/doc/sphinx-guides/source/qa/test-automation.md index fa995bcaafd..a9f40a7dab2 100644 --- a/doc/sphinx-guides/source/qa/test-automation.md +++ b/doc/sphinx-guides/source/qa/test-automation.md @@ -52,7 +52,7 @@ Go to the end of the log and then scroll up, looking for the failure. A failed A ``` TASK [dataverse : download payara zip] ***************************************** -fatal: [localhost]: FAILED! => {"changed": false, "dest": "/tmp/payara.zip", "elapsed": 10, "msg": "Request failed: ", "url": "https://nexus.payara.fish/repository/payara-community/fish/payara/distributions/payara/6.2025.3/payara-6.2025.3.zip"} +fatal: [localhost]: FAILED! => {"changed": false, "dest": "/tmp/payara.zip", "elapsed": 10, "msg": "Request failed: ", "url": "https://nexus.payara.fish/repository/payara-community/fish/payara/distributions/payara/6.2025.10/payara-6.2025.10.zip"} ``` In the example above, if Payara can't be downloaded, we're obviously going to have problems deploying Dataverse to it! diff --git a/doc/sphinx-guides/source/qa/testing-approach.md b/doc/sphinx-guides/source/qa/testing-approach.md index 817161d02a0..49b9075cf7f 100644 --- a/doc/sphinx-guides/source/qa/testing-approach.md +++ b/doc/sphinx-guides/source/qa/testing-approach.md @@ -34,7 +34,7 @@ Think about risk. Is the feature or function part of a critical area such as per ## Smoke Test -1. Go to the homepage on . Scroll to the bottom to ensure the build number is the one you intend to test from Jenkins. +1. 
Go to the homepage on (this server has production data). Scroll to the bottom to ensure the build number is the one you intend to test from Jenkins. 1. Create a new user: It's fine to use a formulaic name with your initials and date and make the username and password the same, eg. kc080622. 1. Create a dataverse: You can use the same username. 1. Create a dataset: You can use the same username; fill in the required fields (do not use a template). diff --git a/doc/sphinx-guides/source/quickstart/index.md b/doc/sphinx-guides/source/quickstart/index.md new file mode 100644 index 00000000000..96f3fdbeff5 --- /dev/null +++ b/doc/sphinx-guides/source/quickstart/index.md @@ -0,0 +1,9 @@ +# Quickstart Guide + +```{toctree} +:caption: "Contents:" +:maxdepth: 1 +what-is-dataverse.md +publish-a-dataset.md +publish-a-collection.md +``` diff --git a/doc/sphinx-guides/source/quickstart/publish-a-collection.md b/doc/sphinx-guides/source/quickstart/publish-a-collection.md new file mode 100644 index 00000000000..4b97703225b --- /dev/null +++ b/doc/sphinx-guides/source/quickstart/publish-a-collection.md @@ -0,0 +1,63 @@ +# Publish a Collection + +## 🔐 Step 1: Log In & Create a Draft + +- {ref}`Log in `. +- (Optional) Navigate to {doc}`a collection `. +- Click "Add Data" → "New Dataverse". + +Note: If you don’t see the "Add Data" button, contact your repository support team. + +## 📝 Step 2: Enter Basic Metadata & Settings + +- Fill in the required {ref}`metadata ` fields +- Select metadata settings. +- Click "Create Dataverse" at the bottom to save your draft collection. + +## 🚀 Step 3: Publish Your Collection + +Note: once published, easy deletion of a collection is no longer possible. + +- Click "Publish" (top right). + +--- + +## ✅ Choose Look & Feel (optional) + +- Click "Edit" → "Theme \+ Widgets". +- Select {ref}`theme settings `. +- Click "Save Changes". + +## ✅ Set Permissions (optional) + +- Click "Edit" → "Permissions". +- Under "Permissions", click "Edit Access" to {ref}`set general permissions `. +- (Optional) Add users or groups with specific permissions under "Users/Groups" by clicking "Assign Roles to Users/Groups". + +## ✅ Create Groups (optional) + +- Click "Edit" → "Groups". +- Click "Create Group". +- Enter a Group Name and Group Identifier. +- Add users to this group using autofill in "Users/Groups". +- Click "Create Group". + +## ✅ Set Dataset Templates (optional) + +- Click "Edit" → "Dataset Templates". +- Click {ref}`Create Dataset Template `. +- Enter a template name. +- Add any template metadata in the metadata fields. +- Click "Save \+ Add Terms". +- {ref}`Choose a license ` from the dropdown or select {ref}`Custom Dataset Terms `. +- Provide other relevant terms information +- Click "Save Dataset Template" + +## ✅ Set Dataset Guestbooks (optional) + +- Click "Edit" → "Dataset Guestbooks". +- Click {ref}`Create Dataset Guestbook `. +- Enter a guestbook name. +- Add information to collect. +- Click "Create Dataset Guestbook". + diff --git a/doc/sphinx-guides/source/quickstart/publish-a-dataset.md b/doc/sphinx-guides/source/quickstart/publish-a-dataset.md new file mode 100644 index 00000000000..91e0da5f93d --- /dev/null +++ b/doc/sphinx-guides/source/quickstart/publish-a-dataset.md @@ -0,0 +1,61 @@ +# Publish a Dataset + +## 🔐 Step 1: Log In & Create a Draft + +- {ref}`Log in `. +- (Optional) Navigate to {doc}`a collection `. +- Click "Add Data" → "{ref}`New Dataset `". + +Note: If you don’t see the "Add Data" button, contact your repository support team. 
+ +## 📝Step 2: Enter Basic Metadata + +- Fill in the required {ref}`metadata ` fields. +- Click "Save" at the bottom to save your draft dataset. + + +## 📁 Step 3: Upload or Edit Files + +- In the draft dataset, scroll down to the "Files" tab. +- Click "{ref}`Upload Files `". +- Choose "Select Files to Add" or drag and drop files. +- (Optional) Use "{ref}`Upload Folder `" if available. +- Click "Done" when upload is completed. + +To edit files: + +- Select files on the left → Click "{ref}`Edit Files `". +- To {ref}`restrict/embargo files `, choose the relevant option. + +## 📜 Step 4: Set Terms of Use + +- Go to the terms tab. +- Click "{ref}`Edit Terms Requirements `". +- {ref}`Choose a license ` from the dropdown or select {ref}`Custom Dataset Terms `. +- Click "Save changes". + +## 🧾 Step 5: Add or Edit Metadata + +- Go to the metadata tab. +- Click "Add + Edit Metadata". +- Click "Save" after making changes. + +## 🚀 Step 6: Publish Your Dataset + +Note: once published, easy deletion of a dataset is no longer possible. + +- Click publish-dataset or submit-for-review (top right). + + - {ref}`Publish Dataset `: Immediately publish a dataset. This option is only available on repositories without a review phase. + + - {ref}`Submit for Review `: Locks draft and sends to support staff. If changes are needed, you’ll be notified via email and you can resubmit. This option is only available on repositories with a review phase. + +## 🔄 Optional: Update a Published Dataset + +- Edit the dataset and publish {ref}`a new version `. +- The DOI remains the same. +- The dataset-versioning tracks all changes: + + - Metadata changes = minor version. + + - Data changes = major version. diff --git a/doc/sphinx-guides/source/quickstart/what-is-dataverse.md b/doc/sphinx-guides/source/quickstart/what-is-dataverse.md new file mode 100644 index 00000000000..6f86473bada --- /dev/null +++ b/doc/sphinx-guides/source/quickstart/what-is-dataverse.md @@ -0,0 +1,40 @@ +# What is Dataverse? + +Dataverse is an open source web application for sharing, preserving, citing, exploring, and analyzing research data. It is developed and supported by the Dataverse user community. + +A Dataverse repository can host one or more Dataverse collections, which organize datasets. + +- Collections can contain datasets and sub-collections for further organization. +- Each dataset includes: + - Metadata + - Data files + - Documentation or code + +## Core Capabilities + +### 📤 Upload, manage, publish and download data files. + +- Upload data while retaining directory structure for better context and reproducibility. +- Manage datasets by inviting collaborators before publication. +- Control access with permissions, configurations, licenses, file restrictions, and guestbooks. +- Publish datasets with rich metadata, licensing, and versioning to make data FAIR. +- Download data with clear terms of use and cite data using provided citation options. + +### 🔍 Make your data findable, reusable, and citable with rich metadata + +- Rich metadata is added to a dataset before publication, with the option to use domain-specific metadata blocks. +- Harvest metadata of datasets through the distribution of standardized data descriptions across the web (e.g. Google Dataset Search). + +### 📜Define how data can be reused with clear terms + +- Standardized licenses can be applied to a dataset. +- Custom dataset terms allow for dataset publication that cannot use standardized licenses. 
+ +### 📈 Keep track of changes on published datasets with versioning + +- Track version changes with metadata and file changes tracked as minor and major versions. +- Access and cite specific versions via the version tab on a dataset. +- Compare versions with the detailed version change overview on dataset-level. + +### ✨More features +The Dataverse project is continuously evolving. For an overview of capabilities, visit the [features list](https://dataverse.org/software-features). diff --git a/doc/sphinx-guides/source/user/account.rst b/doc/sphinx-guides/source/user/account.rst index cd269ddac20..17415a59190 100755 --- a/doc/sphinx-guides/source/user/account.rst +++ b/doc/sphinx-guides/source/user/account.rst @@ -14,6 +14,8 @@ As a registered user, you can: - Contribute to existing datasets, if permitted. - Request access to restricted files, if permitted. +.. _account-log-in-options: + Account Log In Options ~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/sphinx-guides/source/user/dataset-management.rst b/doc/sphinx-guides/source/user/dataset-management.rst index d73459969ce..22e72a6a210 100755 --- a/doc/sphinx-guides/source/user/dataset-management.rst +++ b/doc/sphinx-guides/source/user/dataset-management.rst @@ -8,6 +8,7 @@ A dataset in a Dataverse installation is a container for your data, documentatio .. contents:: |toctitle| :local: +.. _metadata-supported: Supported Metadata ================== @@ -175,6 +176,9 @@ File Previews Dataverse installations can add previewers for common file types uploaded by their research communities. The previews appear on the file page. If a preview tool for a specific file type is available, the preview will be created and will display automatically, after terms have been agreed to or a guestbook entry has been made, if necessary. File previews are not available for restricted files unless they are being accessed using a Preview URL. See also :ref:`previewUrl`. When the dataset license is not the default license, users will be prompted to accept the license/data use agreement before the preview is shown. See also :ref:`license-terms`. +.. note:: + Some previewers run purely in the browser and make direct (JavaScript) requests back to the Dataverse API endpoints to retrieve file contents, metadata, or signed URLs. For these previewers to function when hosted on a different origin (e.g., a CDN or a separate previewer service), the Dataverse installation must have CORS enabled via :ref:`dataverse.cors.origin `. Administrators should configure the list of allowed origins to include the host serving the previewers. + Previewers are available for the following file types: - Text @@ -388,7 +392,8 @@ If the bounding box was successfully populated, :ref:`geospatial-search` should Compressed Files ---------------- -Compressed files in .zip format are unpacked automatically. If a .zip file fails to unpack for whatever reason, it will upload as is. If the number of files inside are more than a set limit (1,000 by default, configurable by the Administrator), you will get an error message and the .zip file will upload as is. +Depending on the configuration, compressed files in .zip format are unpacked automatically. If a .zip file is not unpacked, it will upload as is. +If the number of files inside are more than a set limit (1,000 by default, configurable by the Administrator), you will get an error message and the .zip file will upload as is. If the uploaded .zip file contains a folder structure, the Dataverse installation will keep track of this structure. 
A file's location within this folder structure is displayed in the file metadata as the File Path. When you download the contents of the dataset, this folder structure will be preserved and files will appear in their original locations. @@ -396,6 +401,8 @@ These folder names are subject to strict validation rules. Only the following ch .. note:: If you upload multiple .zip files to one dataset, any subdirectories that are identical across multiple .zips will be merged together when the user downloads the full dataset. +If a .zip file is not unpacked and Zip Previewer is installed (see :ref:`file-previews`), it will be possible for users to view the contents of the zip file and to download individual files from within the .zip. + Other File Types ---------------- @@ -415,9 +422,13 @@ Differentially Private (DP) Metadata can be accessed for restricted tabular file See also :ref:`terms-of-access` and :ref:`permissions`. +.. _edit-files: + Edit Files ========== +.. _edit-file-metadata: + Edit File Metadata ------------------ @@ -470,6 +481,8 @@ Terms Dataset terms can be viewed and edited from the Terms tab of the dataset page, or under the Edit dropdown button of a Dataset. There, you can set up how users can use your data once they have downloaded it (via a standard license or, if allowed, custom terms), how they can access your data if you have files that are restricted (terms of access), and enable a Guestbook for your dataset so that you can track who is using your data and for what purposes. These are explained in further detail below: +.. _choosing-license: + Choosing a License ------------------ @@ -500,6 +513,8 @@ The `Dataverse Community Norms `_ are not a substitute for the CC0 waiver or custom terms and licenses applicable to each dataset. The Community Norms are not a binding contractual agreement, and downloading datasets from a Dataverse installation does not create a legal obligation to follow these policies. +.. _custom-terms: + Custom Terms of Use for Datasets -------------------------------- @@ -682,6 +697,8 @@ Adding Widgets to an OpenScholar Website #. Click on the Settings Cog and select Layout #. At the top right, select Add New Widget and under Misc. you will see the Dataverse Collection and the Dataverse Dataset Citation Widgets. Click on the widget you would like to add, fill out the form, and then drag it to where you would like it to display in the page. +.. _publish-dataset: + Publish Dataset =============== @@ -694,6 +711,8 @@ Whenever you edit your dataset, you are able to publish a new version of the dat Note: Prior to publishing your dataset the Data Citation will indicate that this is a draft but the "DRAFT VERSION" text will be removed as soon as you Publish. +.. _submit-for-review: + Submit for Review ================= @@ -704,7 +723,7 @@ If you have a Contributor role (can edit metadata, upload files, and edit files, Preview URL to Review Unpublished Dataset ========================================= -Creating a Preview URL for a draft version of your dataset allows you to share your dataset (for viewing and downloading of files) before it is published to a wide group of individuals who may not have a user account on the Dataverse installation. Anyone you send the Preview URL to will not have to log into the Dataverse installation to view the unpublished dataset. Once a dataset has been published you may create new General Preview URLs for subsequent draft versions, but the Anonymous Preview URL will no longer be available. 
+Creating a Preview URL for a draft version of your dataset allows you to share your dataset (for viewing and downloading files, including :ref:`restricted ` and :ref:`embargoed ` files) before it is published to a wide group of people who might not have a user account on the Dataverse installation. Anyone you send the Preview URL to will not have to log in to the Dataverse installation to view the unpublished dataset. Once a dataset has been published, you may create new General Preview URLs for subsequent draft versions, but the Anonymous Preview URL will no longer be available. **Note:** To create a Preview URL, you must have the *ManageDatasetPermissions* permission for your draft dataset, usually given by the :ref:`roles ` *Curator* or *Administrator*. @@ -726,6 +745,8 @@ To disable a Preview URL and to revoke access, follow the same steps as above un Note that only one Preview URL (normal or with anonymized access) can be configured per dataset at a time. +.. _embargoes: + Embargoes ========= @@ -750,6 +771,8 @@ Support for file-level retention periods can also be configured in a Dataverse i Retention periods are intended to support use cases where files must be made unavailable - and in most cases destroyed, e.g. to meet legal requirements - after a certain period or date. Actual destruction is not automatically handled, but would have to be done on the storage if needed. +.. _dataset-versions: + Dataset Versions ================ diff --git a/doc/sphinx-guides/source/user/dataverse-management.rst b/doc/sphinx-guides/source/user/dataverse-management.rst index 15376da0896..4e94bfad256 100755 --- a/doc/sphinx-guides/source/user/dataverse-management.rst +++ b/doc/sphinx-guides/source/user/dataverse-management.rst @@ -119,7 +119,7 @@ Clicking on Permissions will bring you to this page: |image3| -When you access a Dataverse collection's permissions page, you will see three sections: +When you access a Dataverse collection's permissions page, you will see three or four sections: **Permissions:** Here you can decide the requirements that determine which types of users can add datasets and sub Dataverse collections to your Dataverse collection, and what permissions they'll be granted when they do so. @@ -127,6 +127,8 @@ When you access a Dataverse collection's permissions page, you will see three se **Roles:** Here you can reference a full list of roles that can be assigned to users of your Dataverse collection. Each role lists the permissions that it offers. +**Role Assignment History** If enabled, you'll be able to see the history of when roles have been assigned and revoked and by whom. + Please note that even on a newly created Dataverse collection, you may see user and groups have already been granted role(s) if your installation has ``:InheritParentRoleAssignments`` set. For more on this setting, see the :doc:`/installation/config` section of the Installation Guide. 
Setting Access Configurations diff --git a/doc/sphinx-guides/source/user/img/access-data.svg b/doc/sphinx-guides/source/user/img/access-data.svg new file mode 100644 index 00000000000..ef27df08523 --- /dev/null +++ b/doc/sphinx-guides/source/user/img/access-data.svg @@ -0,0 +1,95 @@ + + + +Access a DatasetHow to access and download data from +a dataset diff --git a/doc/sphinx-guides/source/user/img/publish-a-collection.svg b/doc/sphinx-guides/source/user/img/publish-a-collection.svg new file mode 100644 index 00000000000..bd4b2737048 --- /dev/null +++ b/doc/sphinx-guides/source/user/img/publish-a-collection.svg @@ -0,0 +1,111 @@ + + + +Publish a CollectionA quickstart guide to create and publish +a data collection diff --git a/doc/sphinx-guides/source/user/img/publish-a-dataset.svg b/doc/sphinx-guides/source/user/img/publish-a-dataset.svg new file mode 100644 index 00000000000..47708db6e2e --- /dev/null +++ b/doc/sphinx-guides/source/user/img/publish-a-dataset.svg @@ -0,0 +1,89 @@ + + + +Publish a DatasetA quickstart guide to create and publish +a dataset diff --git a/doc/sphinx-guides/source/user/img/what-is-dataverse.svg b/doc/sphinx-guides/source/user/img/what-is-dataverse.svg new file mode 100644 index 00000000000..b1056d875d9 --- /dev/null +++ b/doc/sphinx-guides/source/user/img/what-is-dataverse.svg @@ -0,0 +1,116 @@ + + + +What is Dataverse?A quick look at what a Dataverse +repository is + + + + + diff --git a/doc/sphinx-guides/source/user/index.rst b/doc/sphinx-guides/source/user/index.rst index cd6ccdbd421..7a196afe476 100755 --- a/doc/sphinx-guides/source/user/index.rst +++ b/doc/sphinx-guides/source/user/index.rst @@ -6,6 +6,11 @@ User Guide ================================================= +|what-is-dataverse| +|access-data| +|publish-a-dataset| +|publish-a-collection| + **Contents:** .. toctree:: @@ -17,3 +22,23 @@ User Guide dataset-management tabulardataingest/index appendix + +.. |what-is-dataverse| image:: ./img/what-is-dataverse.svg + :scale: 70% + :alt: What is Dataverse button + :target: ../quickstart/what-is-dataverse.html + +.. |access-data| image:: ./img/access-data.svg + :scale: 70% + :alt: Access Data button + :target: ../user/find-use-data.html + +.. |publish-a-dataset| image:: ./img/publish-a-dataset.svg + :scale: 70% + :alt: Publish a Dataset button + :target: ../quickstart/publish-a-dataset.html + +.. |publish-a-collection| image:: ./img/publish-a-collection.svg + :scale: 70% + :alt: Publish a Collection button + :target: ../quickstart/publish-a-collection.html diff --git a/doc/sphinx-guides/source/versions.rst b/doc/sphinx-guides/source/versions.rst index 393b2b07e97..c1035db8d89 100755 --- a/doc/sphinx-guides/source/versions.rst +++ b/doc/sphinx-guides/source/versions.rst @@ -8,6 +8,7 @@ This list provides a way to refer to the documentation for previous and future v - pre-release `HTML (not final!) `__ and `PDF (experimental!) `__ built from the :doc:`develop ` branch :doc:`(how to contribute!) 
` - |version| +- `6.8 `__ - `6.7.1 `__ - `6.7 `__ - `6.6 `__ diff --git a/docker/util/intellij/cpwebapp.sh b/docker/util/intellij/cpwebapp.sh index 2d08fb1a873..9ba0ca6d1a5 100755 --- a/docker/util/intellij/cpwebapp.sh +++ b/docker/util/intellij/cpwebapp.sh @@ -9,15 +9,26 @@ PROJECT_DIR="$1" FILE_TO_COPY="$2" RELATIVE_PATH="${FILE_TO_COPY#"${PROJECT_DIR}/"}" -# Check if RELATIVE_PATH starts with 'src/main/webapp', otherwise ignore +# Only act on files under src/main/webapp if [[ "$RELATIVE_PATH" == "src/main/webapp"* ]]; then - # Extract version from POM, so we don't need to have Maven on the PATH - VERSION=$(grep -oPm1 "(?<=)[^<]+" "$PROJECT_DIR/modules/dataverse-parent/pom.xml") + POM="$PROJECT_DIR/modules/dataverse-parent/pom.xml" - # Construct the target path by cutting off the local prefix and prepend with in-container path - RELATIVE_PATH_WITHOUT_WEBAPP="${RELATIVE_PATH#src/main/webapp/}" - TARGET_PATH="/opt/payara/appserver/glassfish/domains/domain1/applications/dataverse-$VERSION/${RELATIVE_PATH_WITHOUT_WEBAPP}" + # Extract in a portable way + VERSION="$(awk -F'[<>]' '//{print $3; exit}' "$POM")" - # Copy file to container - docker cp "$FILE_TO_COPY" "dev_dataverse:$TARGET_PATH" + if [[ -z "${VERSION:-}" ]]; then + echo "Error: Could not extract from $POM" >&2 + exit 1 + fi + + CONTAINER="dev_dataverse" + + # Build target path + RELATIVE_PATH_WITHOUT_WEBAPP="${RELATIVE_PATH#src/main/webapp/}" + TARGET_PATH="/opt/payara/appserver/glassfish/domains/domain1/applications/dataverse-$VERSION/${RELATIVE_PATH_WITHOUT_WEBAPP}" + + # Copy file into container + docker cp "$FILE_TO_COPY" "$CONTAINER:$TARGET_PATH" + + echo "Copied $FILE_TO_COPY → $CONTAINER:$TARGET_PATH" fi diff --git a/local_lib/io/gdcc/xoai-common/5.3.2.1-local/xoai-common-5.3.2.1-local.jar b/local_lib/io/gdcc/xoai-common/5.3.2.1-local/xoai-common-5.3.2.1-local.jar deleted file mode 100644 index 3154ddbcb1f..00000000000 Binary files a/local_lib/io/gdcc/xoai-common/5.3.2.1-local/xoai-common-5.3.2.1-local.jar and /dev/null differ diff --git a/local_lib/io/gdcc/xoai-common/5.3.2.1-local/xoai-common-5.3.2.1-local.pom b/local_lib/io/gdcc/xoai-common/5.3.2.1-local/xoai-common-5.3.2.1-local.pom deleted file mode 100644 index c272295d0fc..00000000000 --- a/local_lib/io/gdcc/xoai-common/5.3.2.1-local/xoai-common-5.3.2.1-local.pom +++ /dev/null @@ -1,82 +0,0 @@ - - - - - - xoai - io.gdcc - 5.3.2.1-local - - 4.0.0 - - XOAI Commons - xoai-common - OAI-PMH base functionality used for both data and service providers. 
- - - - jakarta.xml.bind - jakarta.xml.bind-api - - - org.hamcrest - hamcrest - - compile - - - io.gdcc - xoai-xmlio - - - org.codehaus.woodstox - stax2-api - - - - com.fasterxml.woodstox - woodstox-core - runtime - true - - - - - org.junit.jupiter - junit-jupiter - test - - - org.xmlunit - xmlunit-core - test - - - org.xmlunit - xmlunit-matchers - test - - - org.openjdk.jmh - jmh-core - 1.37 - test - - - org.openjdk.jmh - jmh-generator-annprocess - 1.37 - test - - - diff --git a/local_lib/io/gdcc/xoai-data-provider/5.3.2.1-local/xoai-data-provider-5.3.2.1-local.jar b/local_lib/io/gdcc/xoai-data-provider/5.3.2.1-local/xoai-data-provider-5.3.2.1-local.jar deleted file mode 100644 index 8899dccb8a9..00000000000 Binary files a/local_lib/io/gdcc/xoai-data-provider/5.3.2.1-local/xoai-data-provider-5.3.2.1-local.jar and /dev/null differ diff --git a/local_lib/io/gdcc/xoai-data-provider/5.3.2.1-local/xoai-data-provider-5.3.2.1-local.pom b/local_lib/io/gdcc/xoai-data-provider/5.3.2.1-local/xoai-data-provider-5.3.2.1-local.pom deleted file mode 100644 index 457991d699b..00000000000 --- a/local_lib/io/gdcc/xoai-data-provider/5.3.2.1-local/xoai-data-provider-5.3.2.1-local.pom +++ /dev/null @@ -1,72 +0,0 @@ - - - - - - xoai - io.gdcc - 5.3.2.1-local - - - 4.0.0 - - XOAI Data Provider - xoai-data-provider - OAI-PMH data provider implementation. Use it to build an OAI-PMH endpoint, providing your data records as harvestable resources. - - - - - org.apache.maven.plugins - maven-jar-plugin - - - - test-jar - - - - - - - - - - io.gdcc - xoai-common - ${project.version} - - - org.slf4j - slf4j-api - - - - - org.junit.jupiter - junit-jupiter - test - - - org.xmlunit - xmlunit-core - test - - - org.xmlunit - xmlunit-matchers - test - - - org.slf4j - slf4j-simple - test - - - diff --git a/local_lib/io/gdcc/xoai-service-provider/5.3.2.1-local/xoai-service-provider-5.3.2.1-local.jar b/local_lib/io/gdcc/xoai-service-provider/5.3.2.1-local/xoai-service-provider-5.3.2.1-local.jar deleted file mode 100644 index be8b7b2fdc3..00000000000 Binary files a/local_lib/io/gdcc/xoai-service-provider/5.3.2.1-local/xoai-service-provider-5.3.2.1-local.jar and /dev/null differ diff --git a/local_lib/io/gdcc/xoai-service-provider/5.3.2.1-local/xoai-service-provider-5.3.2.1-local.pom b/local_lib/io/gdcc/xoai-service-provider/5.3.2.1-local/xoai-service-provider-5.3.2.1-local.pom deleted file mode 100644 index 9a47a40f5f5..00000000000 --- a/local_lib/io/gdcc/xoai-service-provider/5.3.2.1-local/xoai-service-provider-5.3.2.1-local.pom +++ /dev/null @@ -1,65 +0,0 @@ - - - - - - io.gdcc - xoai - 5.3.2.1-local - - 4.0.0 - - XOAI Service Provider - xoai-service-provider - OAI-PMH service provider implementation. Use it as a harvesting client to read remote repositories. 
- - - - io.gdcc - xoai-common - ${project.version} - - - io.gdcc - xoai-xmlio - ${project.version} - - - - org.slf4j - slf4j-api - - - - - io.gdcc - xoai-data-provider - ${project.version} - test - - - io.gdcc - xoai-data-provider - ${project.version} - test-jar - test - - - org.junit.jupiter - junit-jupiter - test - - - org.slf4j - slf4j-simple - test - - - - diff --git a/local_lib/io/gdcc/xoai-xmlio/5.3.2.1-local/xoai-xmlio-5.3.2.1-local.jar b/local_lib/io/gdcc/xoai-xmlio/5.3.2.1-local/xoai-xmlio-5.3.2.1-local.jar deleted file mode 100644 index 634f017657a..00000000000 Binary files a/local_lib/io/gdcc/xoai-xmlio/5.3.2.1-local/xoai-xmlio-5.3.2.1-local.jar and /dev/null differ diff --git a/local_lib/io/gdcc/xoai-xmlio/5.3.2.1-local/xoai-xmlio-5.3.2.1-local.pom b/local_lib/io/gdcc/xoai-xmlio/5.3.2.1-local/xoai-xmlio-5.3.2.1-local.pom deleted file mode 100644 index f7b1c608c05..00000000000 --- a/local_lib/io/gdcc/xoai-xmlio/5.3.2.1-local/xoai-xmlio-5.3.2.1-local.pom +++ /dev/null @@ -1,63 +0,0 @@ - - 4.0.0 - - - io.gdcc - xoai - 5.3.2.1-local - - - xoai-xmlio - jar - XOAI XML IO Commons - Basic XML IO routines used for XOAI OAI-PMH implementation. Forked from obsolete Lyncode sources. - - - - The Apache Software License, Version 2.0 - https://www.apache.org/licenses/LICENSE-2.0.txt - repo - - - - - - org.codehaus.woodstox - stax2-api - - - - com.fasterxml.woodstox - woodstox-core - runtime - true - - - - - org.hamcrest - hamcrest - - - - - org.xmlunit - xmlunit-core - test - - - org.xmlunit - xmlunit-matchers - test - - - org.junit.jupiter - junit-jupiter - test - - - diff --git a/local_lib/io/gdcc/xoai/5.3.2.1-local/xoai-5.3.2.1-local.pom b/local_lib/io/gdcc/xoai/5.3.2.1-local/xoai-5.3.2.1-local.pom deleted file mode 100644 index 04d71ecfb71..00000000000 --- a/local_lib/io/gdcc/xoai/5.3.2.1-local/xoai-5.3.2.1-local.pom +++ /dev/null @@ -1,235 +0,0 @@ - - - - 4.0.0 - pom - - - io.gdcc - parent - 0.10.2 - - - - xoai-common - xoai-data-provider - xoai-service-provider - xoai-xmlio - report - xoai-data-provider-tck - - - xoai - 5.3.2.1-local - - XOAI : OAI-PMH Java Toolkit - - An OAI-PMH data and/or service provider implementation, integration ready for your service. - https://github.com/${project.github.org}/${project.github.repo} - - - 11 - xoai - true - - - 4.0.1 - 4.0.4 - 4.2.2 - 7.0.0 - - - 10.0.4 - - - - - DuraSpace BSD License - https://raw.github.com/DSpace/DSpace/master/LICENSE - repo - - A BSD 3-Clause license for the DSpace codebase. 
- - - - - - - - - com.diffplug.spotless - spotless-maven-plugin - ${spotless.version} - - - origin/branch-5.0 - - - - - - *.md - .gitignore - - - - - - true - 4 - - - - - - - - - - - - - 1.15.0 - - true - - - - - - - - - - - - - - - - - jakarta.xml.bind - jakarta.xml.bind-api - ${jakarta.jaxb.version} - - - - com.sun.xml.bind - jaxb-impl - ${jakarta.jaxb-impl.version} - runtime - true - - - - - com.fasterxml.woodstox - woodstox-core - ${woodstox.version} - - - org.codehaus.woodstox - stax2-api - ${stax2.api.version} - - - - io.gdcc - xoai-xmlio - ${project.version} - - - - - - - Oliver Bertuch - https://github.com/poikilotherm - xoai-lib@gdcc.io - Forschungszentrum Jülich GmbH - https://www.fz-juelich.de/en/zb - - - DSpace @ Lyncode - dspace@lyncode.com - Lyncode - http://www.lyncode.com - - - - - - coverage - - ${maven.multiModuleProjectDirectory}/report/target/site/jacoco-aggregate/jacoco.xml - - - - benchmark - - - - org.apache.maven.plugins - maven-surefire-plugin - ${maven-surefire-plugin.version} - - ${skipUT} - **/*Benchmark - - true - - - - - - - - owasp - - - - - org.owasp - dependency-check-maven - ${dependency-check-maven.version} - - 7 - true - true - - SARIF - owaspSuppression.xml - - - - - check - - - - - - - - - - diff --git a/makefile b/makefile index 315ff9c508c..1ffdb627275 100644 --- a/makefile +++ b/makefile @@ -1,5 +1,5 @@ - -SPHINX_VERSION = $(shell grep "Sphinx" ./doc/sphinx-guides/requirements.txt | awk -F'==' '{print $$2}') +# We use "Sphinx=" to avoid packages like Sphinx-Substitution-Extensions +SPHINX_VERSION = $(shell grep "Sphinx=" ./doc/sphinx-guides/requirements.txt | awk -F'==' '{print $$2}') docs-html: docker run -it --rm -v $$(pwd):/docs sphinxdoc/sphinx:$(SPHINX_VERSION) bash -c "cd doc/sphinx-guides && pip3 install -r requirements.txt && make clean && make html" @@ -11,4 +11,4 @@ docs-epub: docs-all: docker run -it --rm -v $$(pwd):/docs sphinxdoc/sphinx:$(SPHINX_VERSION) bash -c "cd doc/sphinx-guides && pip3 install -r requirements.txt && make clean && make html && make epub" - docker run -it --rm -v $$(pwd):/docs sphinxdoc/sphinx-latexpdf:$(SPHINX_VERSION) bash -c "cd doc/sphinx-guides && pip3 install -r requirements.txt && make latexpdf LATEXMKOPTS=\"-interaction=nonstopmode\"; cd ../.. && ls -1 doc/sphinx-guides/build/latex/Dataverse.pdf" \ No newline at end of file + docker run -it --rm -v $$(pwd):/docs sphinxdoc/sphinx-latexpdf:$(SPHINX_VERSION) bash -c "cd doc/sphinx-guides && pip3 install -r requirements.txt && make latexpdf LATEXMKOPTS=\"-interaction=nonstopmode\"; cd ../.. && ls -1 doc/sphinx-guides/build/latex/Dataverse.pdf" diff --git a/modules/container-base/README.md b/modules/container-base/README.md index f6854482073..bfe3da3d08e 100644 --- a/modules/container-base/README.md +++ b/modules/container-base/README.md @@ -13,10 +13,6 @@ this image for other purposes than the Dataverse application. ## Quick Reference -**Maintained by:** - -This image is created, maintained and supported by the Dataverse community on a best-effort basis. - **Where to find documentation:** The [Dataverse Container Guide - Base Image](https://guides.dataverse.org/en/latest/container/base-image.html) @@ -24,8 +20,7 @@ provides in-depth information about content, building, tuning and so on for this **Where to get help and ask questions:** -IQSS will not offer support on how to deploy or run it. Please reach out to the community for help on using it. 
-You can join the Community Chat on Matrix at https://chat.dataverse.org and https://groups.google.com/g/dataverse-community +You can join the Community Chat at https://chat.dataverse.org and https://groups.google.com/g/dataverse-community to ask for help and guidance. ## Supported Image Tags diff --git a/modules/container-configbaker/Dockerfile b/modules/container-configbaker/Dockerfile index 5532cda1a9e..9fc876a283b 100644 --- a/modules/container-configbaker/Dockerfile +++ b/modules/container-configbaker/Dockerfile @@ -23,6 +23,8 @@ ENV PATH="${PATH}:${SCRIPT_DIR}" \ ARG PKGS="bc curl dnsutils dumb-init ed jq netcat-openbsd postgresql-client" # renovate: datasource=github-releases depName=wait4x/wait4x ARG WAIT4X_VERSION="v3.2.0" +# renovate: datasource=github-releases depName=mikefarah/yq +ARG YQ_VERSION="v4.47.1" # renovate: datasource=pypi depName=awscli ARG AWSCLI_VERSION="1.40.15" ARG PYTHON_PKGS="awscli==${AWSCLI_VERSION}" @@ -65,7 +67,11 @@ RUN true && \ echo "$(cat /tmp/w4x-checksum | cut -f1 -d" ") /usr/bin/wait4x.tar.gz" | sha256sum -c - && \ tar -xzf /usr/bin/wait4x.tar.gz -C /usr/bin && chmod +x /usr/bin/wait4x && \ - # 2. Python packages + # 2. yq-go \ + curl -sSfL -o /usr/bin/yq "https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_${ARCH}" && \ + chmod +x /usr/bin/yq && \ + + # 3. Python packages pipx install --global ${PYTHON_PKGS} # Get in the scripts @@ -81,7 +87,7 @@ COPY --from=solr /opt/solr/server/solr/configsets/_default ${SOLR_TEMPLATE}/ COPY maven/solr/*.xml ${SOLR_TEMPLATE}/conf/ RUN rm ${SOLR_TEMPLATE}/conf/managed-schema.xml - +WORKDIR ${SCRIPT_DIR} # Set the entrypoint to tini (as a process supervisor) ENTRYPOINT ["/usr/bin/dumb-init", "--"] # By default run a script that will print a help message and terminate diff --git a/modules/container-configbaker/README.md b/modules/container-configbaker/README.md index 75862ee0809..d05fa2103ff 100644 --- a/modules/container-configbaker/README.md +++ b/modules/container-configbaker/README.md @@ -7,10 +7,6 @@ You may use this image as is, base your own derivative image on it or use bind m ## Quick Reference -**Maintained by:** - -This image is created, maintained and supported by the Dataverse community on a best-effort basis. - **Where to find documentation:** The [Dataverse Container Guide - Config Baker Image](https://guides.dataverse.org/en/latest/container/configbaker-image.html) provides information about this image. **Where to get help and ask questions:** -IQSS will not offer support on how to deploy or run it. Please reach out to the community for help on using it. -You can join the Community Chat on Matrix at https://chat.dataverse.org and https://groups.google.com/g/dataverse-community +You can join the Community Chat at https://chat.dataverse.org and https://groups.google.com/g/dataverse-community to ask for help and guidance. ## Supported Image Tags diff --git a/modules/container-configbaker/scripts/apply-db-settings.sh b/modules/container-configbaker/scripts/apply-db-settings.sh new file mode 100755 index 00000000000..deb897d138c --- /dev/null +++ b/modules/container-configbaker/scripts/apply-db-settings.sh @@ -0,0 +1,135 @@ +#!/usr/bin/env bash + +# [INFO]: Idempotent replacement of all database settings from a file source.
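+# Illustrative sketch of a settings source file (default: /dv/db-opts.yml). The setting names
+# and values below are placeholders, not required defaults. The file is parsed with yq
+# (JSON/YAML/PROPERTIES/TOML), ${var} references in string values are substituted from the
+# environment (or the optional envSource), and the result is sent as a single JSON document to
+# the /api/admin/settings replacement endpoint - presumably a flat mapping of database setting
+# names to their values, e.g.:
+#
+#   :SystemEmail: "Support Team support@example.org"
+#   :MaxFileUploadSizeInBytes: "2147483648"
+#   :FooterCopyright: "${COPYRIGHT_HOLDER}"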
+ +set -euo pipefail + +function usage() { + echo "Usage: $(basename "$0") [-h] [-u instanceUrl] [-t timeout] [-c configFile] [-b unblockKey] [-e envSource]" + echo "" + echo "Replace all Database Settings in a running Dataverse installation in an idempotent way." + echo "" + echo "Parameters:" + echo "instanceUrl - Location on container network where to reach your instance. Default: 'http://dataverse:8080'" + echo " Can be set as environment variable 'DATAVERSE_URL'." + echo " timeout - Provide how long to wait for the instance to become available (using wait4x). Default: '3m'" + echo " Can be set as environment variable 'TIMEOUT'." + echo " configFile - Path to a JSON, YAML, PROPERTIES or TOML file containing your settings. Default: '/dv/db-opts.yml'" + echo " Can be set as environment variable 'CONFIG_FILE'. May contain \${var} references to env. vars." + echo " unblockKey - Either string or path to a file with the Admin API Unblock Key. Optional for localhost. No default." + echo " Can be set as environment variable 'ADMIN_API_UNBLOCK_KEY'." + echo " envSource - Path to a file or directory used as source for additional environment variables." + echo " Optional, no default. Can be set as environment variable 'ENV_SOURCE'." + echo " Environment variables from this file or directory structure will be script-local." + echo "" + echo "Note: This script will wait for the Dataverse instance to be available before executing the replacement." + echo " Be careful - this script will not stop you from deleting any vital settings." + echo "" + exit 1 +} + +source util/common.sh +source util/read-to-env.sh + +# Check for (the right) yq, jq, and wait4x being available +require_on_path yq +if ! grep -q "https://github.com/mikefarah/yq" <((yq --version)); then + error "You must install yq from https://github.com/mikefarah/yq, not https://github.com/kislyuk/yq" +fi +require_on_path jq +require_on_path wait4x + +# Set some defaults as documented +DATAVERSE_URL=${DATAVERSE_URL:-"http://dataverse:8080"} +ADMIN_API_UNBLOCK_KEY=${ADMIN_API_UNBLOCK_KEY:-""} +TIMEOUT=${TIMEOUT:-"3m"} +CONFIG_FILE=${CONFIG_FILE:-"/dv/db-opts.yml"} +ENV_SOURCE=${ENV_SOURCE:-""} + +while getopts "u:t:c:b:e:h" OPTION +do + case "$OPTION" in + u) DATAVERSE_URL="$OPTARG" ;; + t) TIMEOUT="$OPTARG" ;; + c) CONFIG_FILE="$OPTARG" ;; + b) ADMIN_API_UNBLOCK_KEY="$OPTARG" ;; + e) ENV_SOURCE="$OPTARG" ;; + h) usage;; + \?) usage;; + esac +done +shift $((OPTIND-1)) + +##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### +# PARSE CONFIGURATION + +# In case the env source was given as cmd arg, parse it +if [ -n "$ENV_SOURCE" ]; then + read_to_env "$ENV_SOURCE" +fi + +# Check for file with DB options given, file present and readable as well as parseable by yq +# If parseable, render as JSON to temp file +CONV_CONF_FILE=$(mktemp) +if [ -f "${CONFIG_FILE}" ] && [ -r "${CONFIG_FILE}" ]; then + # See https://mikefarah.gitbook.io/yq/operators/env-variable-operators#tip + yq -M -o json '(.. | select(tag == "!!str")) |= envsubst(nu)' "${CONFIG_FILE}" > "${CONV_CONF_FILE}" || error "Could not parse config file with yq from '${CONFIG_FILE}'." + # TODO: think about adding a debug switch here, not just print + # cat "$CONV_CONF_FILE" +else + error "Could not read a config file at '${CONFIG_FILE}'." 
+fi + +##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### ##### +# API INTERACTION + +# Define an auth header argument (enabling usage of different ways) +AUTH_HEADER_ARG="" + +# Check for Dataverse Unblock API Key present (option with file/env var) +# This is only required if the host is not localhost (then there may be no key necessary) +if ! [[ "${DATAVERSE_URL}" == *"://localhost"* ]] || [ -n "${ADMIN_API_UNBLOCK_KEY}" ]; then + # The argument should not be empty + if [ -z "${ADMIN_API_UNBLOCK_KEY}" ]; then + error "You must provide the Dataverse API Unblock Key to this script." + # In case it's not empty, check if it's a file path and read the key from there + elif [ -f "${ADMIN_API_UNBLOCK_KEY}" ] && [ -r "${ADMIN_API_UNBLOCK_KEY}" ]; then + echo "Reading Dataverse API Unblock Key from ${ADMIN_API_UNBLOCK_KEY}." + if ! API_KEY_FILE_CONTENT=$(cat "${ADMIN_API_UNBLOCK_KEY}" 2>/dev/null); then + error "Could not read unblock key from file ${ADMIN_API_UNBLOCK_KEY}." + fi + # Validate the key is not empty + if [ -z "${API_KEY_FILE_CONTENT}" ]; then + error "API key file ${ADMIN_API_UNBLOCK_KEY} appears empty." + fi + ADMIN_API_UNBLOCK_KEY="$API_KEY_FILE_CONTENT" + fi + # Very basic error check (as there is no clear format or formal spec for the key) + if [ ${#ADMIN_API_UNBLOCK_KEY} -lt 5 ]; then + error "API key appears to be too short (<5 chars)." + fi + + # Build the header argument for Admin API Authentication via unblock key + AUTH_HEADER_ARG="X-Dataverse-unblock-key: ${ADMIN_API_UNBLOCK_KEY}" +fi + +# Check or wait for Dataverse API being responsive +echo "Waiting for ${DATAVERSE_URL} to become ready in max ${TIMEOUT}." +wait4x http "${DATAVERSE_URL}/api/info/version" -i 8s -t "$TIMEOUT" --expect-status-code 200 --expect-body-json data.version + +# Check for Dataverse Admin API endpoints being reachable by retrieving the current DB options, expect blockades! +CURRENT_SETTINGS=$(mktemp) +echo "Retrieving settings from running instance." +# TODO: Do we need to support pre v6.7 style unblock key query parameter? +curl -sSL --fail-with-body -o "${CURRENT_SETTINGS}" -H "${AUTH_HEADER_ARG}" "${DATAVERSE_URL}/api/admin/settings" \ + || error "Failed. Response message: $( cat "${CURRENT_SETTINGS}")" \ + && echo "Success!" + # TODO: while it's nice to have the current settings written out, it may contain sensitive information (so don't). + # && ( echo "Success! Current settings: "; jq '.data' < "$CURRENT_SETTINGS" ) + +# We need to make the settings update atomic. +echo "Replacing settings." +RESPONSE=$(mktemp) +curl -sSL --fail-with-body -o "${RESPONSE}" -X PUT -H "${AUTH_HEADER_ARG}" --json @"${CONV_CONF_FILE}" "${DATAVERSE_URL}/api/admin/settings" \ + || error "Failed. Response message: $( jq ".message" < "${RESPONSE}" )" \ + && ( echo -e "Success!\nOperations executed: "; jq '.data' < "$RESPONSE" ) diff --git a/modules/container-configbaker/scripts/bootstrap/demo/init.sh b/modules/container-configbaker/scripts/bootstrap/demo/init.sh index aa73cb5edff..b2735b50b28 100644 --- a/modules/container-configbaker/scripts/bootstrap/demo/init.sh +++ b/modules/container-configbaker/scripts/bootstrap/demo/init.sh @@ -31,7 +31,7 @@ fi echo "" echo "Revoke the key that allows for creation of builtin users..." -curl -sS -X DELETE "${DATAVERSE_URL}/api/admin/settings/BuiltinUsers.KEY" +curl -sS -X DELETE "${DATAVERSE_URL}/api/admin/settings/:BuiltinUsersKey" # TODO: stop using these deprecated database settings.
See https://github.com/IQSS/dataverse/pull/11454 echo "" diff --git a/modules/container-configbaker/scripts/util/common.sh b/modules/container-configbaker/scripts/util/common.sh new file mode 100644 index 00000000000..91de5257a5c --- /dev/null +++ b/modules/container-configbaker/scripts/util/common.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +function error { + echo "ERROR:" "$@" >&2 + exit 2 +} + +function exists_on_path { + type "$1" >/dev/null 2>&1 && return 0 + ( IFS=:; for p in $PATH; do [ -x "${p%/}/$1" ] && return 0 || echo "${p%/}/$1"; done; return 1 ) +} + +function require_on_path { + if ! exists_on_path "$1"; then + error "No $1 executable found on PATH." + fi +} diff --git a/modules/container-configbaker/scripts/util/read-to-env.sh b/modules/container-configbaker/scripts/util/read-to-env.sh new file mode 100644 index 00000000000..485586521ab --- /dev/null +++ b/modules/container-configbaker/scripts/util/read-to-env.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +set -euo pipefail + +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +# Read from a target into environment variables. +# Parameters: $target +# Case A) If $target is a file, simply source it. +# Case B) If $target is a directory, parse dirs and files in it as variable names and file content as value +function read_to_env() { + local target="$1" + + if [ -f "$target" ] && [ -r "$target" ]; then + set -o allexport + # shellcheck disable=SC1090 + source "$target" + set +o allexport + elif [ -d "$target" ] && [ -r "$target" ] && [ -x "$target" ]; then + # Find all files (K8s secrets are symlinks, so look for not directory & remove the hidden mounted files.) + FILES=$( find "$target" -not -type d -printf '%P\n' | grep -v '^\.\.' ) + for FILE in $FILES; do + # Same as MPCONFIG does! + VARNAME=$( echo "$FILE" | tr '[:lower:]' '[:upper:]' | tr '/' '_' ) + VARVAL=$( cat "$target/$FILE") + + # Use printf to create the variable in global scope + printf -v "$VARNAME" '%s' "$VARVAL" + export "${VARNAME?}" + done + else + error "'$target' not a (readable) environment file or directory" + fi +} diff --git a/modules/dataverse-parent/pom.xml b/modules/dataverse-parent/pom.xml index 013034f14d1..6ed02dda20c 100644 --- a/modules/dataverse-parent/pom.xml +++ b/modules/dataverse-parent/pom.xml @@ -132,7 +132,7 @@ - 6.8 + 6.9 17 UTF-8 @@ -149,9 +149,10 @@ -Duser.timezone=${project.timezone} -Dfile.encoding=${project.build.sourceEncoding} -Duser.language=${project.language} -Duser.region=${project.region} - 6.2025.3 + 6.2025.10 42.7.7 9.8.0 + 16 2.33.0 26.30.0 @@ -165,11 +166,10 @@ 4.4.14 - - 5.3.2.1-local + 5.3.0 - 1.19.7 + 2.0.2 3.7.1 5.10.2 5.11.0 @@ -419,15 +419,22 @@ unidata-all Unidata All https://artifacts.unidata.ucar.edu/repository/unidata-all/ + + false + + + + --> diff --git a/modules/dataverse-spi/pom.xml b/modules/dataverse-spi/pom.xml index b00053fe5e0..a603e274234 100644 --- a/modules/dataverse-spi/pom.xml +++ b/modules/dataverse-spi/pom.xml @@ -13,7 +13,7 @@ io.gdcc dataverse-spi - 2.0.0${project.version.suffix} + 2.1.0${project.version.suffix} jar Dataverse SPI Plugin API @@ -64,11 +64,13 @@ - ossrh - https://s01.oss.sonatype.org/content/repositories/snapshots + central + https://central.sonatype.com/repository/maven-snapshots/ + ossrh + https://s01.oss.sonatype.org/service/local/staging/deploy/maven2/ @@ -110,7 +112,9 @@ nexus-staging-maven-plugin true + ossrh + https://s01.oss.sonatype.org true diff --git a/modules/dataverse-spi/src/main/java/io/gdcc/spi/export/ExportDataContext.java 
b/modules/dataverse-spi/src/main/java/io/gdcc/spi/export/ExportDataContext.java new file mode 100644 index 00000000000..9478d39c4c2 --- /dev/null +++ b/modules/dataverse-spi/src/main/java/io/gdcc/spi/export/ExportDataContext.java @@ -0,0 +1,61 @@ +package io.gdcc.spi.export; + +/** + * + * @author landreev + * Provides an optional mechanism for defining various data retrieval options + * for the export subsystem in a way that should allow us adding support for + * more options going forward with minimal or no changes to the already + * implemented export plugins. + */ +public class ExportDataContext { + private boolean datasetMetadataOnly = false; + private boolean publicFilesOnly = false; + private Integer offset = null; + private Integer length = null; + + private ExportDataContext() { + + } + + public static ExportDataContext context() { + ExportDataContext context = new ExportDataContext(); + return context; + } + + public ExportDataContext withDatasetMetadataOnly() { + this.datasetMetadataOnly = true; + return this; + } + + public ExportDataContext withPublicFilesOnly() { + this.publicFilesOnly = true; + return this; + } + + public ExportDataContext withOffset(Integer offset) { + this.offset = offset; + return this; + } + + public ExportDataContext withLength(Integer length) { + this.length = length; + return this; + } + + public boolean isDatasetMetadataOnly() { + return datasetMetadataOnly; + } + + public boolean isPublicFilesOnly() { + return publicFilesOnly; + } + + public Integer getOffset() { + return offset; + } + + public Integer getLength() { + return length; + } +} diff --git a/modules/dataverse-spi/src/main/java/io/gdcc/spi/export/ExportDataProvider.java b/modules/dataverse-spi/src/main/java/io/gdcc/spi/export/ExportDataProvider.java index d039ac39e8f..4197d978e79 100644 --- a/modules/dataverse-spi/src/main/java/io/gdcc/spi/export/ExportDataProvider.java +++ b/modules/dataverse-spi/src/main/java/io/gdcc/spi/export/ExportDataProvider.java @@ -21,8 +21,14 @@ public interface ExportDataProvider { * OAI_ORE export are the only two that provide 'complete' * dataset-level metadata along with basic file metadata for each file * in the dataset. + * @param context - supplies optional parameters. Needs to support + * context.isDatasetMetadataOnly(). In a situation where we + * need to generate a format like DC that has no use for the + * file-level metadata, it makes sense to skip retrieving and + * formatting it, since there can be a very large number of + * files in a dataset. */ - JsonObject getDatasetJson(); + JsonObject getDatasetJson(ExportDataContext... context); /** * @@ -32,14 +38,15 @@ public interface ExportDataProvider { * @apiNote - THis, and the JSON format are the only two that provide complete * dataset-level metadata along with basic file metadata for each file * in the dataset. + * @param context - supplies optional parameters. */ - JsonObject getDatasetORE(); + JsonObject getDatasetORE(ExportDataContext... context); /** * Dataverse is capable of extracting DDI-centric metadata from tabular * datafiles. This detailed metadata, which is only available for successfully * "ingested" tabular files, is not included in the output of any other methods - * in this interface. + * in this interface. * * @return - a JSONArray with one entry per ingested tabular dataset file. 
* @apiNote - there is no JSON schema available for this output and the format @@ -47,9 +54,26 @@ public interface ExportDataProvider { * edu.harvard.iq.dataverse.export.DDIExporter and the @see * edu.harvard.iq.dataverse.util.json.JSONPrinter classes where this * output is used/generated (respectively). + * @param context - supplies optional parameters. */ - JsonArray getDatasetFileDetails(); + JsonArray getDatasetFileDetails(ExportDataContext... context); + /** + * Similar to the above, but + * a) retrieves the information for the ingested/tabular data files _only_ + * b) provides an option for retrieving this stuff in batches + * c) provides an option for skipping restricted/embargoed etc. files. + * Intended for datasets with massive numbers of tabular files and datavariables. + * @param context - supplies optional parameters. + * current (2.1.0) known use cases: + * context.isPublicFilesOnly(); + * context.getOffset(); + * context.getLength(); + * @return json array containing the datafile/filemetadata->datatable->datavariable metadata + * @throws ExportException + */ + JsonArray getTabularDataDetails(ExportDataContext ... context) throws ExportException; + /** * * @return - the subset of metadata conforming to the schema.org standard as @@ -58,8 +82,9 @@ public interface ExportDataProvider { * @apiNote - as this metadata export is not complete, it should only be used as * a starting point for an Exporter if it simplifies your exporter * relative to using the JSON or OAI_ORE exports. + * @param context - supplies optional parameters. */ - JsonObject getDatasetSchemaDotOrg(); + JsonObject getDatasetSchemaDotOrg(ExportDataContext... context); /** * @@ -68,8 +93,9 @@ public interface ExportDataProvider { * @apiNote - as this metadata export is not complete, it should only be used as * a starting point for an Exporter if it simplifies your exporter * relative to using the JSON or OAI_ORE exports. + * @param context - supplies optional parameters. */ - String getDataCiteXml(); + String getDataCiteXml(ExportDataContext... context); /** * If an Exporter has specified a prerequisite format name via the @@ -88,9 +114,10 @@ public interface ExportDataProvider { * malfunction, e.g. if you depend on format "ddi" and a third party * Exporter is configured to replace the internal ddi Exporter in * Dataverse. + * @param context - supplies optional parameters. */ - default Optional getPrerequisiteInputStream() { + default Optional getPrerequisiteInputStream(ExportDataContext... context) { return Optional.empty(); } - -} + + } diff --git a/modules/dataverse-spi/src/main/java/io/gdcc/spi/export/Exporter.java b/modules/dataverse-spi/src/main/java/io/gdcc/spi/export/Exporter.java index 1338a3c9734..7132e74641b 100644 --- a/modules/dataverse-spi/src/main/java/io/gdcc/spi/export/Exporter.java +++ b/modules/dataverse-spi/src/main/java/io/gdcc/spi/export/Exporter.java @@ -85,7 +85,6 @@ default Optional getPrerequisiteFormatName() { return Optional.empty(); } - /** * Harvestable Exporters will be available as options in Dataverse's Harvesting mechanism. * @return true to make this exporter available as a harvesting option. 
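With the varargs ExportDataContext parameter on the ExportDataProvider methods, an exporter can page through tabular/variable metadata instead of pulling everything at once. A hedged sketch of a caller, assuming the provider returns an empty JsonArray once the offset is past the last tabular file (the termination behavior is not spelled out in this diff):

```java
import io.gdcc.spi.export.ExportDataContext;
import io.gdcc.spi.export.ExportDataProvider;
import io.gdcc.spi.export.ExportException;
import jakarta.json.JsonArray;
import jakarta.json.JsonObject;
import java.util.ArrayList;
import java.util.List;

public class BatchedTabularRead {

    /** Reads tabular/variable metadata for public files in fixed-size batches. */
    public static List<JsonArray> readTabularDetailsInBatches(ExportDataProvider provider, int batchSize)
            throws ExportException {
        List<JsonArray> batches = new ArrayList<>();
        int offset = 0;
        while (true) {
            JsonArray batch = provider.getTabularDataDetails(
                    ExportDataContext.context()
                            .withPublicFilesOnly()
                            .withOffset(offset)
                            .withLength(batchSize));
            // Assumption: an empty (or null) result means we are past the last tabular file.
            if (batch == null || batch.isEmpty()) {
                break;
            }
            batches.add(batch);
            offset += batchSize;
        }
        return batches;
    }

    /** Dataset-level metadata only, skipping the potentially very large per-file section. */
    public static JsonObject readDatasetMetadataOnly(ExportDataProvider provider) {
        return provider.getDatasetJson(ExportDataContext.context().withDatasetMetadataOnly());
    }
}
```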
diff --git a/pom.xml b/pom.xml index ceb5ea28d84..b844017d8c8 100644 --- a/pom.xml +++ b/pom.xml @@ -20,7 +20,11 @@ false false - integration + integration,migration + + + -Ddummy.jacoco.property=true + -Ddummy.jacoco.property=true @@ -32,7 +36,7 @@ 1.20.1 5.4.0 3.2.2 - 5.5.3 + 5.9.1 Dataverse API ${project.version} @@ -744,36 +748,36 @@ + + org.dbunit + dbunit + 3.0.0 + test + org.testcontainers testcontainers test - - - junit - junit - - org.testcontainers - junit-jupiter + testcontainers-junit-jupiter test org.testcontainers - postgresql + testcontainers-postgresql test com.github.dasniko testcontainers-keycloak - 3.6.0 + 4.0.0 test org.testcontainers - localstack + testcontainers-localstack test + **/jakarta.mime.types **/mime.types **/*.R @@ -1036,7 +1044,13 @@ ${testsToExclude} ${skipUnitTests} - ${surefire.jacoco.args} ${argLine} + + @{surefire.jacoco.args} ${argLine} **/builtin-users-spi/** @@ -1048,8 +1062,17 @@ maven-failsafe-plugin ${it.groups} - ${failsafe.jacoco.args} ${argLine} + + @{failsafe.jacoco.args} ${argLine} ${skipIntegrationTests} + + ${postgresql.server.version} + @@ -1266,6 +1289,7 @@ true + %a diff --git a/scripts/api/data/licenses/licenseEUPL-1.2.json b/scripts/api/data/licenses/licenseEUPL-1.2.json new file mode 100644 index 00000000000..b87d66f0aa8 --- /dev/null +++ b/scripts/api/data/licenses/licenseEUPL-1.2.json @@ -0,0 +1,11 @@ +{ + "name": "EUPL-1.2", + "uri": "https://joinup.ec.europa.eu/page/eupl-text-11-12", + "shortDescription": "European Union Public License 1.2.", + "active": true, + "sortOrder": 14, + "rightsIdentifier": "EUPL-1.2", + "rightsIdentifierScheme": "SPDX", + "schemeUri": "https://spdx.org/licenses/", + "languageCode": "en" +} diff --git a/scripts/api/data/licenses/licenseODC-By-1.0.json b/scripts/api/data/licenses/licenseODC-By-1.0.json new file mode 100644 index 00000000000..27bd08f0bcf --- /dev/null +++ b/scripts/api/data/licenses/licenseODC-By-1.0.json @@ -0,0 +1,11 @@ +{ + "name": "ODC-By-1.0", + "uri": "https://opendatacommons.org/licenses/by/1.0/", + "shortDescription": "Open Data Commons Attribution License v1.0.", + "active": true, + "sortOrder": 12, + "rightsIdentifier": "ODC-By-1.0", + "rightsIdentifierScheme": "SPDX", + "schemeUri": "https://spdx.org/licenses/", + "languageCode": "en" +} diff --git a/scripts/api/data/licenses/licenseODbL-1.0.json b/scripts/api/data/licenses/licenseODbL-1.0.json new file mode 100644 index 00000000000..cca01be3365 --- /dev/null +++ b/scripts/api/data/licenses/licenseODbL-1.0.json @@ -0,0 +1,11 @@ +{ + "name": "ODbL-1.0", + "uri": "http://www.opendatacommons.org/licenses/odbl/1.0/", + "shortDescription": "Open Data Commons Open Database License v1.0.", + "active": true, + "sortOrder": 11, + "rightsIdentifier": "ODbL-1.0", + "rightsIdentifierScheme": "SPDX", + "schemeUri": "https://spdx.org/licenses/", + "languageCode": "en" +} diff --git a/scripts/api/data/licenses/licenseOGL-UK-3.0.json b/scripts/api/data/licenses/licenseOGL-UK-3.0.json new file mode 100644 index 00000000000..4bc39476af2 --- /dev/null +++ b/scripts/api/data/licenses/licenseOGL-UK-3.0.json @@ -0,0 +1,11 @@ +{ + "name": "OGL-UK-3.0", + "uri": "https://www.nationalarchives.gov.uk/doc/open-government-licence/version/3", + "shortDescription": "Open Government Licence v3.0.", + "active": true, + "sortOrder": 15, + "rightsIdentifier": "OGL-UK-3.0", + "rightsIdentifierScheme": "SPDX", + "schemeUri": "https://spdx.org/licenses/", + "languageCode": "en" +} diff --git a/scripts/api/data/licenses/licensePDDL-1.0.json 
b/scripts/api/data/licenses/licensePDDL-1.0.json new file mode 100644 index 00000000000..12a4fe00bc7 --- /dev/null +++ b/scripts/api/data/licenses/licensePDDL-1.0.json @@ -0,0 +1,11 @@ +{ + "name": "PDDL-1.0", + "uri": "http://opendatacommons.org/licenses/pddl/1.0/", + "shortDescription": "Open Data Commons Public Domain Dedication & License 1.0.", + "active": true, + "sortOrder": 13, + "rightsIdentifier": "PDDL-1.0", + "rightsIdentifierScheme": "SPDX", + "schemeUri": "https://spdx.org/licenses/", + "languageCode": "en" +} diff --git a/scripts/api/data/workflows/internal-coar-notify-relationship-announcement-workflow.json b/scripts/api/data/workflows/internal-coar-notify-relationship-announcement-workflow.json new file mode 100644 index 00000000000..f0cbea63443 --- /dev/null +++ b/scripts/api/data/workflows/internal-coar-notify-relationship-announcement-workflow.json @@ -0,0 +1,16 @@ +{ + "name": "COAR Notify Relationship Announcement workflow", + "steps": [ + { + "provider":":internal", + "stepType":"coarNotifyRelationshipAnnouncement", + "parameters": { + "stepName":"LDN Announce" + }, + "requiredSettings": { + ":COARNotifyRelationshipAnnouncementTriggerFields": "string", + ":COARNotifyRelationshipAnnouncementTargets": "string" + } + } + ] +} diff --git a/scripts/api/data/workflows/internal-ldnannounce-workflow.json b/scripts/api/data/workflows/internal-ldnannounce-workflow.json deleted file mode 100644 index 9cf058b68a1..00000000000 --- a/scripts/api/data/workflows/internal-ldnannounce-workflow.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "name": "LDN Announce workflow", - "steps": [ - { - "provider":":internal", - "stepType":"ldnannounce", - "parameters": { - "stepName":"LDN Announce" - }, - "requiredSettings": { - ":LDNAnnounceRequiredFields": "string", - ":LDNTarget": "string" - } - } - ] -} diff --git a/scripts/api/post-install-api-block.sh b/scripts/api/post-install-api-block.sh index 4cc0ac783f7..f7753665b5b 100755 --- a/scripts/api/post-install-api-block.sh +++ b/scripts/api/post-install-api-block.sh @@ -4,7 +4,7 @@ # the sensitive API endpoints, in order to block it for the general public. # First, revoke the authentication token from the built-in user: -curl -X DELETE $SERVER/admin/settings/BuiltinUsers.KEY +curl -X DELETE "$SERVER/admin/settings/:BuiltinUsersKey" # Block the sensitive endpoints: # Relevant settings: diff --git a/scripts/api/setup-all.sh b/scripts/api/setup-all.sh index b7f962209e4..bd0bd77c52b 100755 --- a/scripts/api/setup-all.sh +++ b/scripts/api/setup-all.sh @@ -57,7 +57,7 @@ echo "- Allow internal signup" curl -X PUT -d yes "${DATAVERSE_URL}/api/admin/settings/:AllowSignUp" curl -X PUT -d "/dataverseuser.xhtml?editMode=CREATE" "${DATAVERSE_URL}/api/admin/settings/:SignUpUrl" -curl -X PUT -d burrito "${DATAVERSE_URL}/api/admin/settings/BuiltinUsers.KEY" +curl -X PUT -d burrito "${DATAVERSE_URL}/api/admin/settings/:BuiltinUsersKey" curl -X PUT -d localhost-only "${DATAVERSE_URL}/api/admin/settings/:BlockedApiPolicy" curl -X PUT -d 'native/http' "${DATAVERSE_URL}/api/admin/settings/:UploadMethods" echo @@ -91,7 +91,7 @@ if [ $SECURESETUP = 1 ] then # Revoke the "burrito" super-key; # Block sensitive API endpoints; - curl -X DELETE "${DATAVERSE_URL}/api/admin/settings/BuiltinUsers.KEY" + curl -X DELETE "${DATAVERSE_URL}/api/admin/settings/:BuiltinUsersKey" curl -X PUT -d 'admin,builtin-users' "${DATAVERSE_URL}/api/admin/settings/:BlockedApiEndpoints" echo "Access to the /api/admin and /api/test is now disabled, except for connections from localhost." 
else diff --git a/scripts/api/setup-users.sh b/scripts/api/setup-users.sh index 141e1b3150f..7df771dc0fe 100755 --- a/scripts/api/setup-users.sh +++ b/scripts/api/setup-users.sh @@ -5,7 +5,7 @@ SERVER=http://localhost:8080/api echo Setting up users on $SERVER echo ============================================== -curl -X PUT -d burrito $SERVER/admin/settings/BuiltinUsers.KEY +curl -X PUT -d burrito "$SERVER/admin/settings/:BuiltinUsersKey" peteResp=$(curl -s -H "Content-type:application/json" -X POST -d @data/userPete.json "$SERVER/builtin-users?password=pete&key=burrito") diff --git a/scripts/issues/2454/run-test.sh b/scripts/issues/2454/run-test.sh index 49eb45a8a5e..5ae0ac33f4d 100755 --- a/scripts/issues/2454/run-test.sh +++ b/scripts/issues/2454/run-test.sh @@ -39,7 +39,7 @@ if [ $SETUP_NEEDED == "yes" ]; then echo $ROOT_USER api key is $ROOT_KEY # Create @anAuthUser - USER_CREATION_KEY=$($DB "SELECT content FROM setting WHERE name='BuiltinUsers.KEY'") + USER_CREATION_KEY=$($DB "SELECT content FROM setting WHERE name=':BuiltinUsersKey'") AN_AUTH_USER_KEY=$( curl -s -X POST -d@anAuthUser.json -H"Content-type:application/json" $ENDPOINT/builtin-users?password=XXX\&key=$USER_CREATION_KEY | jq .data.apiToken | tr -d \") ANOTHER_AUTH_USER_KEY=$( curl -s -X POST -d@anotherAuthUser.json -H"Content-type:application/json" $ENDPOINT/builtin-users?password=XXX\&key=$USER_CREATION_KEY | jq .data.apiToken | tr -d \") echo diff --git a/src/main/docker/README.md b/src/main/docker/README.md index 48416c196ca..a32c91a810e 100644 --- a/src/main/docker/README.md +++ b/src/main/docker/README.md @@ -24,7 +24,7 @@ for more details on tunable settings, locations, etc. **Where to get help and ask questions:** IQSS will not offer support on how to deploy or run it. Please reach out to the community for help on using it. -You can join the Community Chat on Matrix at https://chat.dataverse.org and https://groups.google.com/g/dataverse-community +You can join the Community Chat at https://chat.dataverse.org and https://groups.google.com/g/dataverse-community to ask for help and guidance. 
## Supported Image Tags diff --git a/src/main/java/META-INF/jakarta.mime.types b/src/main/java/META-INF/jakarta.mime.types new file mode 100644 index 00000000000..7e11a630e78 --- /dev/null +++ b/src/main/java/META-INF/jakarta.mime.types @@ -0,0 +1,42 @@ +# Common document formats +application/pdf pdf PDF +application/msword doc DOC +application/vnd.ms-excel xls XLS xlc XLC xll XLL xlm XLM xlw XLW +application/vnd.openxmlformats-officedocument.spreadsheetml.sheet xlsx XLSX +text/comma-separated-values csv CSV +text/plain txt TXT +text/xml xml XML +# Common statistical data formats +text/tsv tab TAB tsv TSV +text/x-fixed-field dat DAT asc ASC +application/x-rlang-transport Rdata RData rdata RDATA +type/x-r-syntax r R +application/x-stata dta DTA +text/x-stata-syntax do DO +application/x-spss-sav sav SAV +application/x-spss-por por POR +text/x-spss-syntax sps SPS +application/x-sas-transport xpt XPT cport CPORT v5x V5X v6x V6X v7x V7X +application/x-sas-system sas7bdat SAS7BDAT sd1 SD1 sd2 SD2 sd7 SD7 ssd01 SSD01 ssd SSD ssd04 SSD04 +text/x-sas-syntax sas SAS +# Common image formats +image/gif gif GIF +image/jpeg jpeg JPEG jpg JPG jpe JPE +image/bmp bmp BMP +image/x-portable-bitmap pbm PBM +image/x-portable-graymap pgm PGM +image/png png PNG +image/x-portable-anymap pnm PNM +image/x-portable-pixmap ppm PPM +image/cmu-raster ras RAS +image/x-rgb rgb RGB +image/tiff tif TIF tiff TIFF +image/x-xbitmap xbm XBM +image/x-xpixmap xpm XPM +image/x-xwindowdump xwd XWD +# Common archive formats +application/zip zip ZIP +application/x-gzip gz GZ +application/x-tar tar TAR +# Rdata +application/octet-stream \ No newline at end of file diff --git a/src/main/java/META-INF/mime.types b/src/main/java/META-INF/mime.types index 7e11a630e78..6f8aaec6d25 100644 --- a/src/main/java/META-INF/mime.types +++ b/src/main/java/META-INF/mime.types @@ -1,42 +1,43 @@ -# Common document formats -application/pdf pdf PDF -application/msword doc DOC -application/vnd.ms-excel xls XLS xlc XLC xll XLL xlm XLM xlw XLW -application/vnd.openxmlformats-officedocument.spreadsheetml.sheet xlsx XLSX -text/comma-separated-values csv CSV -text/plain txt TXT -text/xml xml XML -# Common statistical data formats -text/tsv tab TAB tsv TSV -text/x-fixed-field dat DAT asc ASC -application/x-rlang-transport Rdata RData rdata RDATA -type/x-r-syntax r R -application/x-stata dta DTA -text/x-stata-syntax do DO -application/x-spss-sav sav SAV -application/x-spss-por por POR -text/x-spss-syntax sps SPS -application/x-sas-transport xpt XPT cport CPORT v5x V5X v6x V6X v7x V7X -application/x-sas-system sas7bdat SAS7BDAT sd1 SD1 sd2 SD2 sd7 SD7 ssd01 SSD01 ssd SSD ssd04 SSD04 -text/x-sas-syntax sas SAS -# Common image formats -image/gif gif GIF -image/jpeg jpeg JPEG jpg JPG jpe JPE -image/bmp bmp BMP -image/x-portable-bitmap pbm PBM -image/x-portable-graymap pgm PGM -image/png png PNG -image/x-portable-anymap pnm PNM -image/x-portable-pixmap ppm PPM -image/cmu-raster ras RAS -image/x-rgb rgb RGB -image/tiff tif TIF tiff TIFF -image/x-xbitmap xbm XBM -image/x-xpixmap xpm XPM -image/x-xwindowdump xwd XWD -# Common archive formats -application/zip zip ZIP -application/x-gzip gz GZ -application/x-tar tar TAR -# Rdata +# mime.types is a copy of jakarta.mime.types which only used in unit tests (specifically FileUtilTest.testGZipFile) +# Common document formats +application/pdf pdf PDF +application/msword doc DOC +application/vnd.ms-excel xls XLS xlc XLC xll XLL xlm XLM xlw XLW +application/vnd.openxmlformats-officedocument.spreadsheetml.sheet xlsx XLSX 
+text/comma-separated-values csv CSV +text/plain txt TXT +text/xml xml XML +# Common statistical data formats +text/tsv tab TAB tsv TSV +text/x-fixed-field dat DAT asc ASC +application/x-rlang-transport Rdata RData rdata RDATA +type/x-r-syntax r R +application/x-stata dta DTA +text/x-stata-syntax do DO +application/x-spss-sav sav SAV +application/x-spss-por por POR +text/x-spss-syntax sps SPS +application/x-sas-transport xpt XPT cport CPORT v5x V5X v6x V6X v7x V7X +application/x-sas-system sas7bdat SAS7BDAT sd1 SD1 sd2 SD2 sd7 SD7 ssd01 SSD01 ssd SSD ssd04 SSD04 +text/x-sas-syntax sas SAS +# Common image formats +image/gif gif GIF +image/jpeg jpeg JPEG jpg JPG jpe JPE +image/bmp bmp BMP +image/x-portable-bitmap pbm PBM +image/x-portable-graymap pgm PGM +image/png png PNG +image/x-portable-anymap pnm PNM +image/x-portable-pixmap ppm PPM +image/cmu-raster ras RAS +image/x-rgb rgb RGB +image/tiff tif TIF tiff TIFF +image/x-xbitmap xbm XBM +image/x-xpixmap xpm XPM +image/x-xwindowdump xwd XWD +# Common archive formats +application/zip zip ZIP +application/x-gzip gz GZ +application/x-tar tar TAR +# Rdata application/octet-stream \ No newline at end of file diff --git a/src/main/java/edu/harvard/iq/dataverse/DataFileCategoryServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DataFileCategoryServiceBean.java index 29dcb22c3ec..d29b5670952 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataFileCategoryServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataFileCategoryServiceBean.java @@ -1,6 +1,7 @@ package edu.harvard.iq.dataverse; import edu.harvard.iq.dataverse.settings.SettingsServiceBean; +import static edu.harvard.iq.dataverse.settings.SettingsServiceBean.Key.FileCategories; import edu.harvard.iq.dataverse.util.BundleUtil; import jakarta.ejb.EJB; @@ -21,7 +22,7 @@ @Stateless public class DataFileCategoryServiceBean { - public static final String FILE_CATEGORIES_KEY = ":FileCategories"; + public static final String FILE_CATEGORIES_KEY = FileCategories.toString(); @EJB private SettingsServiceBean settingsService; diff --git a/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java index 937f5693511..1c880ce464b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DataFileServiceBean.java @@ -28,18 +28,19 @@ import java.util.Map; import java.util.Set; import java.util.UUID; +import java.util.function.Function; import java.util.logging.Level; import java.util.logging.Logger; +import java.util.stream.Collectors; + import jakarta.ejb.EJB; import jakarta.ejb.Stateless; import jakarta.ejb.TransactionAttribute; import jakarta.ejb.TransactionAttributeType; import jakarta.inject.Named; -import jakarta.persistence.EntityManager; -import jakarta.persistence.NoResultException; -import jakarta.persistence.PersistenceContext; -import jakarta.persistence.Query; -import jakarta.persistence.TypedQuery; +import jakarta.persistence.*; +import jakarta.persistence.criteria.*; +import org.apache.commons.lang3.StringUtils; /** * @@ -281,60 +282,21 @@ public List findFileMetadataByDatasetVersionId(Long datasetVersion .setMaxResults(maxResults) .getResultList(); } - - public List findFileMetadataByDatasetVersionIdLabelSearchTerm(Long datasetVersionId, String searchTerm, String userSuppliedSortField, String userSuppliedSortOrder){ - FileSortFieldAndOrder sortFieldAndOrder = new FileSortFieldAndOrder(userSuppliedSortField, userSuppliedSortOrder); - String 
sortField = sortFieldAndOrder.getSortField(); - String sortOrder = sortFieldAndOrder.getSortOrder(); - String searchClause = ""; - if(searchTerm != null && !searchTerm.isEmpty()){ - searchClause = " and (lower(o.label) like '%" + searchTerm.toLowerCase() + "%' or lower(o.description) like '%" + searchTerm.toLowerCase() + "%')"; - } - - String queryString = "select o from FileMetadata o where o.datasetVersion.id = :datasetVersionId" - + searchClause - + " order by o." + sortField + " " + sortOrder; - return em.createQuery(queryString, FileMetadata.class) - .setParameter("datasetVersionId", datasetVersionId) - .getResultList(); - } - - public List findFileMetadataIdsByDatasetVersionIdLabelSearchTerm(Long datasetVersionId, String searchTerm, String userSuppliedSortField, String userSuppliedSortOrder){ + public List findDataFileIdsByDatasetVersionIdLabelSearchTerm(Long datasetVersionId, String userSuppliedSearchTerm, String userSuppliedSortField, String userSuppliedSortOrder) { FileSortFieldAndOrder sortFieldAndOrder = new FileSortFieldAndOrder(userSuppliedSortField, userSuppliedSortOrder); - - searchTerm=searchTerm.trim(); - String sortField = sortFieldAndOrder.getSortField(); - String sortOrder = sortFieldAndOrder.getSortOrder(); - String searchClause = ""; - if(searchTerm != null && !searchTerm.isEmpty()){ - searchClause = " and (lower(o.label) like '%" + searchTerm.toLowerCase() + "%' or lower(o.description) like '%" + searchTerm.toLowerCase() + "%')"; - } - - //the createNativeQuary takes persistant entities, which Integer.class is not, - //which is causing the exception. Hence, this query does not need an Integer.class - //as the second parameter. - return em.createNativeQuery("select o.id from FileMetadata o where o.datasetVersion_id = " + datasetVersionId - + searchClause - + " order by o." + sortField + " " + sortOrder) - .getResultList(); - } - - public List findDataFileIdsByDatasetVersionIdLabelSearchTerm(Long datasetVersionId, String searchTerm, String userSuppliedSortField, String userSuppliedSortOrder){ - FileSortFieldAndOrder sortFieldAndOrder = new FileSortFieldAndOrder(userSuppliedSortField, userSuppliedSortOrder); - - searchTerm=searchTerm.trim(); - String sortField = sortFieldAndOrder.getSortField(); - String sortOrder = sortFieldAndOrder.getSortOrder(); - String searchClause = ""; - if(searchTerm != null && !searchTerm.isEmpty()){ - searchClause = " and (lower(o.label) like '%" + searchTerm.toLowerCase() + "%' or lower(o.description) like '%" + searchTerm.toLowerCase() + "%')"; + String searchTerm = !StringUtils.isBlank(userSuppliedSearchTerm) ? "%"+userSuppliedSearchTerm.trim().toLowerCase()+"%" : null; + + String selectClause = "select o.datafile_id from FileMetadata o where o.datasetversion_id = " + datasetVersionId; + String searchClause = searchTerm != null ? " and (lower(o.label) like ? or lower(o.description) like ?)" : ""; + String orderByClause = " order by o." + sortFieldAndOrder.getSortField() + " " + sortFieldAndOrder.getSortOrder(); + + Query query = em.createNativeQuery(selectClause + searchClause + orderByClause); + if (searchTerm != null) { + query.setParameter(1, searchTerm); + query.setParameter(2, searchTerm); } - - return em.createNativeQuery("select o.datafile_id from FileMetadata o where o.datasetVersion_id = " + datasetVersionId - + searchClause - + " order by o." 
+ sortField + " " + sortOrder) - .getResultList(); + return query.getResultList(); } public List findFileMetadataByDatasetVersionIdLazy(Long datasetVersionId, int maxResults, String userSuppliedSortField, String userSuppliedSortOrder, int firstResult) { @@ -376,6 +338,133 @@ public FileMetadata findFileMetadataByDatasetVersionIdAndDataFileId(Long dataset } } + /** + * Finds the complete history of a file's presence across all dataset versions. + *

+ * This method returns a {@link VersionedFileMetadata} entry for every version + * of the specified dataset. If a version does not contain the file, the + * {@code fileMetadata} field in the corresponding DTO will be {@code null}. + * It correctly handles file replacements by searching for all files sharing the + * same {@code rootDataFileId}. + * + * @param datasetId The ID of the parent dataset. + * @param dataFile The DataFile entity to find the history for. + * @param canViewUnpublishedVersions A boolean indicating if the user has permission to view non-released versions. + * @param limit (Optional) The maximum number of results to return. + * @param offset (Optional) The starting point of the result list. + * @return A chronologically sorted, paginated list of the file's version history, including versions where the file is absent. + */ + public List findFileMetadataHistory(Long datasetId, + DataFile dataFile, + boolean canViewUnpublishedVersions, + Integer limit, + Integer offset) { + if (dataFile == null) { + return Collections.emptyList(); + } + + // Query 1: Get the paginated list of relevant DatasetVersions + CriteriaBuilder cb = em.getCriteriaBuilder(); + CriteriaQuery versionQuery = cb.createQuery(DatasetVersion.class); + Root versionRoot = versionQuery.from(DatasetVersion.class); + + List versionPredicates = new ArrayList<>(); + versionPredicates.add(cb.equal(versionRoot.join("dataset").get("id"), datasetId)); + if (!canViewUnpublishedVersions) { + versionPredicates.add(versionRoot.get("versionState").in( + VersionState.RELEASED, VersionState.DEACCESSIONED)); + } + versionQuery.where(versionPredicates.toArray(new Predicate[0])); + versionQuery.orderBy( + cb.desc(versionRoot.get("versionNumber")), + cb.desc(versionRoot.get("minorVersionNumber")) + ); + + TypedQuery typedVersionQuery = em.createQuery(versionQuery); + if (limit != null) { + typedVersionQuery.setMaxResults(limit); + } + if (offset != null) { + typedVersionQuery.setFirstResult(offset); + } + List datasetVersions = typedVersionQuery.getResultList(); + + if (datasetVersions.isEmpty()) { + return Collections.emptyList(); + } + + // Query 2: Get all FileMetadata for this file's history in this dataset + CriteriaQuery fmQuery = cb.createQuery(FileMetadata.class); + Root fmRoot = fmQuery.from(FileMetadata.class); + + List fmPredicates = new ArrayList<>(); + fmPredicates.add(cb.equal(fmRoot.get("datasetVersion").get("dataset").get("id"), datasetId)); + + // Find the file by its entire lineage + if (dataFile.getRootDataFileId() < 0) { + fmPredicates.add(cb.equal(fmRoot.get("dataFile").get("id"), dataFile.getId())); + } else { + fmPredicates.add(cb.equal(fmRoot.get("dataFile").get("rootDataFileId"), dataFile.getRootDataFileId())); + } + fmQuery.where(fmPredicates.toArray(new Predicate[0])); + + List fileHistory = em.createQuery(fmQuery).getResultList(); + + // Combine results + Map fmMap = fileHistory.stream() + .collect(Collectors.toMap( + fm -> fm.getDatasetVersion().getId(), + Function.identity() + )); + + // Create the final list, looking up the FileMetadata for each version + return datasetVersions.stream() + .map(version -> new VersionedFileMetadata( + version, + fmMap.get(version.getId()) // This will be null if no entry exists for that version ID + )) + .collect(Collectors.toList()); + } + + /** + * Finds the FileMetadata for a given file in the version immediately preceding a specified version. + * + * @param fileMetadata The FileMetadata instance from the current version, used to identify the file's lineage. 
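The two new DataFileServiceBean methods lend themselves to straightforward paging over a file's history. A rough caller-side sketch; the wrapper class is hypothetical, and only the service method signatures are taken from this diff (the accessors of VersionedFileMetadata are not shown here, so the sketch only inspects page sizes):

```java
import edu.harvard.iq.dataverse.DataFile;
import edu.harvard.iq.dataverse.DataFileServiceBean;
import edu.harvard.iq.dataverse.FileMetadata;
import java.util.List;

public class FileHistorySketch {

    /** Pages through a file's version history, ten dataset versions at a time. */
    public static int countHistoryPages(DataFileServiceBean fileService,
                                        Long datasetId,
                                        DataFile dataFile,
                                        boolean canViewUnpublishedVersions) {
        final int pageSize = 10;
        int pages = 0;
        int offset = 0;
        List<?> page;
        do {
            page = fileService.findFileMetadataHistory(
                    datasetId, dataFile, canViewUnpublishedVersions, pageSize, offset);
            if (!page.isEmpty()) {
                pages++;
                offset += pageSize;
            }
        } while (page.size() == pageSize);
        return pages;
    }

    /** Returns null when the file has no earlier version (no previousDataFileId). */
    public static FileMetadata previousVersionOf(DataFileServiceBean fileService, FileMetadata fm) {
        return fileService.getPreviousFileMetadata(fm);
    }
}
```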
+ * @return The FileMetadata from the immediately prior version, or {@code null} if this is the first version of the file. + */ + public FileMetadata getPreviousFileMetadata(FileMetadata fileMetadata) { + if (fileMetadata == null || fileMetadata.getDataFile() == null) { + return null; + } + + // 1. Get the ID of the file that was replaced. + Long previousId = fileMetadata.getDataFile().getPreviousDataFileId(); + + // If there's no previous ID, this is the first version of the file. + if (previousId == null) { + return null; + } + + CriteriaBuilder cb = em.getCriteriaBuilder(); + CriteriaQuery cq = cb.createQuery(FileMetadata.class); + Root fileMetadataRoot = cq.from(FileMetadata.class); + + // 2. Join FileMetadata to DataFile to access the ID. + Join dataFileJoin = fileMetadataRoot.join("dataFile"); + + // 3. Find the FileMetadata whose DataFile ID matches the previousId. + cq.where(cb.equal(dataFileJoin.get("id"), previousId)); + + // --- Execution --- + TypedQuery query = em.createQuery(cq); + try { + return query.getSingleResult(); + } catch (NoResultException e) { + // If no result is found, return null. + return null; + } + } + public FileMetadata findMostRecentVersionFileIsIn(DataFile file) { if (file == null) { return null; diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetField.java b/src/main/java/edu/harvard/iq/dataverse/DatasetField.java index a735ae7470c..0f47caf256b 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetField.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetField.java @@ -197,6 +197,7 @@ public void setDatasetFieldValues(List datasetFieldValues) { @ManyToMany(cascade = {CascadeType.MERGE}) @JoinTable(indexes = {@Index(columnList="datasetfield_id"),@Index(columnList="controlledvocabularyvalues_id")}) + @OrderBy("displayOrder ASC") private List controlledVocabularyValues = new ArrayList<>(); public List getControlledVocabularyValues() { @@ -604,14 +605,15 @@ private DatasetField copy(Object versionOrTemplate, DatasetFieldCompoundValue pa if (versionOrTemplate != null) { if (versionOrTemplate instanceof DatasetVersion) { - dsf.setDatasetVersion((DatasetVersion) versionOrTemplate); + dsf.setDatasetVersion((DatasetVersion) versionOrTemplate); } else { dsf.setTemplate((Template) versionOrTemplate); } } dsf.setParentDatasetFieldCompoundValue(parent); - dsf.setControlledVocabularyValues(controlledVocabularyValues); + + dsf.getControlledVocabularyValues().addAll(controlledVocabularyValues); for (DatasetFieldValue dsfv : datasetFieldValues) { dsf.getDatasetFieldValues().add(dsfv.copy(dsf)); diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldServiceBean.java index 0b6b74e6a73..e6b2711b443 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetFieldServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetFieldServiceBean.java @@ -43,6 +43,7 @@ import jakarta.persistence.criteria.*; import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.math.NumberUtils; import org.apache.http.HttpResponse; import org.apache.http.HttpResponseInterceptor; import org.apache.http.client.methods.HttpGet; @@ -52,6 +53,7 @@ import org.apache.http.protocol.HttpContext; import org.apache.http.util.EntityUtils; import edu.harvard.iq.dataverse.settings.SettingsServiceBean; +import edu.harvard.iq.dataverse.util.ListSplitUtil; /** * @@ -764,13 +766,24 @@ Object processPathSegment(int index, String[] pathParts, 
JsonValue curPath, Stri JsonValue val = jo.get(keyVal[0]); if (val != null) { if (val.getValueType().equals(ValueType.STRING)) { + //Match a string value if (((JsonString) val).getString().equals(expected)) { logger.fine("Found: " + jo); curPath = jo; return processPathSegment(index + 1, pathParts, curPath, termUri); } - } else { - logger.warning("Expected a string value for " + keyVal[0] + " but found: " + val.getValueType()); + } else if (val.getValueType() == JsonValue.ValueType.ARRAY) { + // Match one string in an array + JsonArray jsonArray = (JsonArray) val; + for (JsonValue arrayVal : jsonArray) { + if (arrayVal.getValueType() == JsonValue.ValueType.STRING) { + if (((JsonString) arrayVal).getString().equals(expected)) { + logger.fine("Found match in array: " + jo.toString()); + curPath = jo; + return processPathSegment(index + 1, pathParts, curPath, termUri); + } + } + } } } } @@ -793,8 +806,18 @@ Object processPathSegment(int index, String[] pathParts, JsonValue curPath, Stri } } else { - curPath = ((JsonObject) curPath).get(pathParts[index]); - logger.fine("Found next Path object " + curPath); + if ((curPath instanceof JsonArray) && NumberUtils.isCreatable(pathParts[index])) { + try { + int indexNumber = Integer.parseInt(pathParts[index]); + curPath = ((JsonArray) curPath).get(indexNumber); + } catch (NumberFormatException nfe) { + logger.fine("Please provide a valid integer number " + pathParts[index]); + } + } else { + curPath = ((JsonObject) curPath).get(pathParts[index]); + } + // curPath = ((JsonObject) curPath).get(pathParts[index]); + logger.fine("Found next Path object " + ((curPath == null) ? "null" : curPath.toString())); return processPathSegment(index + 1, pathParts, curPath, termUri); } } else { @@ -886,12 +909,12 @@ public String getFieldLanguage(String languages, String localeCode) { // If the fields list of supported languages contains the current locale (e.g. // the lang of the UI, or the current metadata input/display lang (tbd)), use // that. Otherwise, return the first in the list - String[] langStrings = languages.split("\\s*,\\s*"); - if (langStrings.length > 0) { - if (Arrays.asList(langStrings).contains(localeCode)) { + final List langStrings = ListSplitUtil.split(languages); + if (!langStrings.isEmpty()) { + if (langStrings.contains(localeCode)) { return localeCode; } else { - return langStrings[0]; + return langStrings.get(0); } } return null; diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetLinkingDataverse.java b/src/main/java/edu/harvard/iq/dataverse/DatasetLinkingDataverse.java index dec07a09643..28b061ffa2a 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetLinkingDataverse.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetLinkingDataverse.java @@ -9,6 +9,7 @@ import jakarta.persistence.Id; import jakarta.persistence.Index; import jakarta.persistence.JoinColumn; +import jakarta.persistence.NamedNativeQuery; import jakarta.persistence.NamedQueries; import jakarta.persistence.NamedQuery; import jakarta.persistence.OneToOne; @@ -35,6 +36,18 @@ @NamedQuery(name = "DatasetLinkingDataverse.findIdsByLinkingDataverseId", query = "SELECT o.dataset.id FROM DatasetLinkingDataverse AS o WHERE o.linkingDataverse.id = :linkingDataverseId") }) + + @NamedNativeQuery( + name = "DatasetLinkingDataverse.findByDatasetIdAndLinkingDataverseName", + query = """ + select o.linkingDataverse_id from DatasetLinkingDataverse as o + LEFT JOIN dataverse dv ON dv.id = o.linkingDataverse_id + WHERE o.dataset_id =? AND ((LOWER(dv.name) LIKE ? 
and ((SUBSTRING(LOWER(dv.name),0,(LENGTH(dv.name)-9)) LIKE ?) + or (SUBSTRING(LOWER(dv.name),0,(LENGTH(dv.name)-9)) LIKE ?))) + or (LOWER(dv.name) NOT LIKE ? and ((LOWER(dv.name) LIKE ?) + or (LOWER(dv.name) LIKE ?))))""" + ) + public class DatasetLinkingDataverse implements Serializable { private static final long serialVersionUID = 1L; @Id diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetLinkingServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/DatasetLinkingServiceBean.java index 39c82bfa3f1..6b0f8af6590 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetLinkingServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetLinkingServiceBean.java @@ -5,6 +5,7 @@ */ package edu.harvard.iq.dataverse; +import jakarta.ejb.EJB; import java.util.ArrayList; import java.util.List; import java.util.logging.Logger; @@ -13,7 +14,6 @@ import jakarta.persistence.EntityManager; import jakarta.persistence.NoResultException; import jakarta.persistence.PersistenceContext; -import jakarta.persistence.Query; import jakarta.persistence.TypedQuery; /** @@ -28,6 +28,9 @@ public class DatasetLinkingServiceBean implements java.io.Serializable { @PersistenceContext(unitName = "VDCNet-ejbPU") private EntityManager em; + @EJB + DataverseServiceBean dataverseService; + public List findLinkedDatasets(Long dataverseId) { @@ -41,13 +44,42 @@ public List findLinkedDatasets(Long dataverseId) { } public List findLinkingDataverses(Long datasetId) { + return findLinkingDataverses(datasetId, ""); + } + + public List findLinkingDataverses(Long datasetId, String searchTerm) { List retList = new ArrayList<>(); - TypedQuery typedQuery = em.createNamedQuery("DatasetLinkingDataverse.findByDatasetId", DatasetLinkingDataverse.class) - .setParameter("datasetId", datasetId); - for (DatasetLinkingDataverse datasetLinkingDataverse : typedQuery.getResultList()) { - retList.add(datasetLinkingDataverse.getLinkingDataverse()); + if (searchTerm == null || searchTerm.isEmpty()) { + TypedQuery typedQuery = em.createNamedQuery("DatasetLinkingDataverse.findByDatasetId", DatasetLinkingDataverse.class) + .setParameter("datasetId", datasetId); + for (DatasetLinkingDataverse datasetLinkingDataverse : typedQuery.getResultList()) { + retList.add(datasetLinkingDataverse.getLinkingDataverse()); + } + return retList; + + } else { + + String pattern = searchTerm.toLowerCase(); + + String pattern1 = pattern + "%"; + String pattern2 = "% " + pattern + "%"; + + // Adjust the queries for very short, 1 and 2-character patterns: + if (pattern.length() == 1) { + pattern1 = pattern; + pattern2 = pattern + " %"; + } + TypedQuery typedQuery + = em.createNamedQuery("DatasetLinkingDataverse.findByDatasetIdAndLinkingDataverseName", Long.class) + .setParameter(1, datasetId).setParameter(2, "%dataverse").setParameter(3, pattern1) + .setParameter(4, pattern2).setParameter(5, "%dataverse").setParameter(6, pattern1).setParameter(7, pattern2); + + for (Long id : typedQuery.getResultList()) { + retList.add(dataverseService.find(id)); + } + return retList; } - return retList; + } public void save(DatasetLinkingDataverse datasetLinkingDataverse) { diff --git a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java index b41e8d4ac35..20617160a1c 100644 --- a/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java +++ b/src/main/java/edu/harvard/iq/dataverse/DatasetPage.java @@ -1,5 +1,8 @@ package edu.harvard.iq.dataverse; +import edu.harvard.iq.dataverse.authorization.DataverseRole; 
+import edu.harvard.iq.dataverse.engine.command.DataverseRequest; +import edu.harvard.iq.dataverse.globus.Permissions; import edu.harvard.iq.dataverse.provenance.ProvPopupFragmentBean; import edu.harvard.iq.dataverse.api.AbstractApiBean; import edu.harvard.iq.dataverse.authorization.AuthenticationServiceBean; @@ -332,6 +335,7 @@ public enum DisplayMode { private List linkingDVSelectItems; private Dataverse linkingDataverse; private Dataverse selectedHostDataverse; + private Boolean hasDataversesToChoose; public Dataverse getSelectedHostDataverse() { return selectedHostDataverse; @@ -1781,6 +1785,22 @@ public void setDataverseTemplates(List
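The search-term overload of findLinkingDataverses shown above can back a type-ahead filter on the "linked to" list. A minimal sketch, assuming an injected DatasetLinkingServiceBean; the wrapper method is illustrative, not part of the diff:

```java
import edu.harvard.iq.dataverse.DatasetLinkingServiceBean;
import edu.harvard.iq.dataverse.Dataverse;
import java.util.List;

public class LinkingDataverseSearchSketch {

    /**
     * Narrows the list of dataverses linking to a dataset by a user-typed term.
     * An empty term falls back to the original unfiltered named query.
     */
    public static List<Dataverse> linkingDataversesMatching(DatasetLinkingServiceBean linkingService,
                                                            Long datasetId,
                                                            String typedTerm) {
        String term = typedTerm == null ? "" : typedTerm.trim();
        // The service matches the term at the start of the dataverse name or at a word
        // boundary, with special handling for names ending in "dataverse" (see the
        // DatasetLinkingDataverse.findByDatasetIdAndLinkingDataverseName native query above).
        return linkingService.findLinkingDataverses(datasetId, term);
    }
}
```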