|
| 1 | +#' Extract Project Metadata from DOI |
| 2 | +#' |
| 3 | +#' Some works are explicitly connected to a publication and the metadata for that publication are fairly complete. |
| 4 | +#' Instead of re-writing the metadata, it would be better to extract it and transform it. |
| 5 | +#' |
| 6 | +#' @param doi String. DOI for a published work |
| 7 | +#' @param file_path String. Where should the output be written? |
| 8 | +#' @param write_output Logical. Should the output be written to a file? |
| 9 | +#' @family Project Metadata |
| 10 | +#' |
| 11 | +#' @returns data frame. A data frame structured in the same way as the metadata template csv. |
| 12 | +#' @export |
| 13 | +#' |
| 14 | +#' @examples |
| 15 | +#' |
| 16 | +#' doi <-"doi.org/10.1038/s41597-025-05332-x" |
| 17 | +#' extract_metadata_from_doi(doi = doi,write_output=FALSE) |
| 18 | +#' |
| 19 | +extract_metadata_from_doi <- function(doi, file_path, write_output = TRUE){ |
| 20 | + if(!curl::has_internet()){ |
| 21 | + rlang::abort("An internet connection is required to extract metadata") |
| 22 | + } |
| 23 | + |
| 24 | + assertthat::assert_that(assertthat::is.string(doi),msg = "doi must be a non-vector string") |
| 25 | + |
| 26 | + assertthat::assert_that(assertthat::is.scalar(write_output)&is.logical(write_output),msg = "write_output must be logical and scalar") |
| 27 | + |
| 28 | + # make sure DOI is properly formatted |
| 29 | + doi <- trimws(doi,"both") |
| 30 | + |
| 31 | + out <- extract_metadata_oa(doi = doi) |
| 32 | + # extract_metadata_zenodo(doi = doi) |
| 33 | + |
| 34 | + if(write_output){ |
| 35 | + assertthat::assert_that(assertthat::is.string(file_path),msg = "file_path must be a non-vector string") |
| 36 | + readr::write_csv(x = out,file = file_path) |
| 37 | + } |
| 38 | + |
| 39 | + return(out) |
| 40 | +} |
| 41 | + |
| 42 | +#' Extract Metadata from Open Alex record |
| 43 | +#' |
| 44 | +#' Uses the DOI for a work to extract metadata from OpenAlex - https://openalex.org/. |
| 45 | +#' The OpenAlex data model does not included some fields that are part |
| 46 | +#' of the wdds project metadata related identifiers. |
| 47 | +#' |
| 48 | +#' Carefully review and edit the metadata produced. |
| 49 | +#' |
| 50 | +#' We recommend writing the metadata to a csv, editing the csv, then |
| 51 | +#' processing it as demonstrated in the project metadata tutorial. |
| 52 | +#' |
| 53 | +#' |
| 54 | +#' @param doi Character. A digital object identifier for a published work. |
| 55 | +#' |
| 56 | +#' @returns data frame. A data frame structured in the same way as the metadata template CSV. |
| 57 | +#' @family Project Metadata |
| 58 | +#' @export |
| 59 | +#' |
| 60 | +#' @examples |
| 61 | +#' |
| 62 | +#' doi <- "doi.org/10.1038/s41597-025-05332-x" |
| 63 | +#' extract_metadata_oa(doi = doi) |
| 64 | +#' |
| 65 | +extract_metadata_oa<-function(doi){ |
| 66 | + |
| 67 | + assertthat::assert_that(assertthat::is.string(doi),msg = "doi must be a non-vector string") |
| 68 | + |
| 69 | + # doi.org/10.1038/s41597-025-05332-x |
| 70 | + oa_url <- sprintf("https://api.openalex.org/works/%s",doi) |
| 71 | + |
| 72 | + oa_json <- jsonlite::fromJSON(txt = oa_url) |
| 73 | + oa_json$authorships$affiliations |
| 74 | + |
| 75 | + # Name Jane Doe |
| 76 | + # Given Name Jane |
| 77 | + # Family Name Doe |
| 78 | + # Name Identifier https://orcid.org/0000-0003-fake-1111 |
| 79 | + # Affiliation Department of Biology, State University |
| 80 | + # Affiliation Identifier https://ror.org/02aqsfake |
| 81 | + |
| 82 | + ## get creators |
| 83 | + |
| 84 | + creators <- oa_json$authorships |
| 85 | + |
| 86 | + # get name |
| 87 | + |
| 88 | + creator_df <- data.frame("Name" = creators$raw_author_name) |
| 89 | + creator_df <- creator_df |> |
| 90 | + dplyr::mutate("Family Name" = stringr::str_split_i(.data$Name, pattern = " ", -1)) |> |
| 91 | + dplyr::mutate("Given Name" = stringr::str_remove(.data$Name, pattern = .data$`Family Name`)) |
| 92 | + |
| 93 | + # get identifier |
| 94 | + creator_df$`Name Identifier` <- creators$author$orcid |
| 95 | + |
| 96 | + # get affiliation string and identifier |
| 97 | + |
| 98 | + aff_df <- creators$affiliations |> |
| 99 | + purrr::map_df(function(x){ |
| 100 | + raw_affiliation <- x$raw_affiliation_string[[1]] |
| 101 | + |
| 102 | + oa_inst_id <- x$institution_ids[[1]][1] |> |
| 103 | + fs::path_file() |
| 104 | + |
| 105 | + oa_inst_api <- sprintf("https://api.openalex.org/institutions/%s", oa_inst_id) |
| 106 | + |
| 107 | + oa_inst_list <- jsonlite::fromJSON(oa_inst_api) |
| 108 | + |
| 109 | + oa_inst_list$ror |
| 110 | + |
| 111 | + out <- data.frame("Affiliation" = raw_affiliation, "Affiliation Identifier" = oa_inst_list$ror) |
| 112 | + |
| 113 | + return(out) |
| 114 | + }) |
| 115 | + |
| 116 | + creator_df_tidy <- cbind(creator_df,aff_df) |> |
| 117 | + dplyr::rename("Affiliation Identifier" = .data$Affiliation.Identifier) |
| 118 | + |
| 119 | + # reshape to template form |
| 120 | + creator_df_expanded <- expand_tidy_dfs(creator_df_tidy, group_prefix = "Creators") |
| 121 | + |
| 122 | + # titles |
| 123 | + title_df_expanded <- data.frame(title = oa_json$title) |> |
| 124 | + expand_tidy_dfs(group_prefix = "Titles") |
| 125 | + |
| 126 | + # publicationYear |
| 127 | + publicationYear_df <- make_simple_df(property = "publicationYear",value= oa_json$publication_year) |
| 128 | + # language |
| 129 | + language_df <- make_simple_df(property = "language", value = oa_json$language) |
| 130 | + # description |
| 131 | + description_df <- data.frame(Description = "FILL ME IN","Description Type" = "abstract",check.names = FALSE) |> |
| 132 | + expand_tidy_dfs(group_prefix = "Descriptions") |
| 133 | + |
| 134 | + # fundingReferences |
| 135 | + |
| 136 | + "Funder Name NSF |
| 137 | +Funder Identifier http://dx.doi.org/10.13039/10000fake |
| 138 | +Award Number DBI 2515340 |
| 139 | +Award URI https://www.viralemergence.org/grants |
| 140 | +Award Title Verena Fellow-in-Residence Award" |
| 141 | + |
| 142 | + funder_references_tidy <- oa_json$grants |> |
| 143 | + dplyr::mutate(oa_funder_id = fs::path_file(.data$funder)) |> |
| 144 | + dplyr::mutate(oa_funder_api = sprintf("https://api.openalex.org/funders/%s", .data$oa_funder_id)) |> |
| 145 | + dplyr::mutate(funder_identifier = purrr::map_chr(.data$oa_funder_api, function(x){ |
| 146 | + funder_json <- jsonlite::fromJSON(x) |
| 147 | + funder_ids <- funder_json$ids |
| 148 | + #use one of ror, crossref doi, or openalex id |
| 149 | + ids_ordered <- c("ror","doi","wikidata","openalex") |
| 150 | + preferred_id <- which(ids_ordered %in% names(funder_ids))[1] |
| 151 | + funder_ids[ids_ordered[preferred_id]][[1]] |
| 152 | + }) |
| 153 | + ) |> |
| 154 | + dplyr::select(dplyr::all_of(c("funder_display_name","funder_identifier","award_id"))) |> |
| 155 | + dplyr::rename("Funder Name" = "funder_display_name", |
| 156 | + "Funder Identifier" = "funder_identifier", |
| 157 | + "Award Number" = "award_id" |
| 158 | + ) |> |
| 159 | + dplyr::mutate( |
| 160 | + "Award URI" = "", |
| 161 | + "Award ID" = "" |
| 162 | + ) |
| 163 | + |
| 164 | + funder_references_df <- expand_tidy_dfs(funder_references_tidy, group_prefix = "Funding References") |
| 165 | + |
| 166 | + |
| 167 | + # subjects |
| 168 | + |
| 169 | + subjects_df <- data.frame(Subject = oa_json$keywords$display_name) |> |
| 170 | + expand_tidy_dfs(group_prefix = 'Subjects') |
| 171 | + |
| 172 | + # Related Identifiers |
| 173 | + |
| 174 | + related_identifiers_tidy <- data.frame("Related Identifier" = "A valid Identifier like a DOI", |
| 175 | + "Related Identifier Type" = "see accepted values here https://datacite-metadata-schema.readthedocs.io/en/4.5/appendices/appendix-1/relatedIdentifierType/#relatedidentifiertype", |
| 176 | + "Relation Type" = "see accepted values here: https://datacite-metadata-schema.readthedocs.io/en/4.5/appendices/appendix-1/relationType/#relationtype",check.names = FALSE ) |
| 177 | + |
| 178 | + related_identifiers_df <- related_identifiers_tidy |> |
| 179 | + expand_tidy_dfs(group_prefix = "Related Identifiers") |
| 180 | + |
| 181 | + |
| 182 | + # rbind data frames |
| 183 | + |
| 184 | + out <- rbind(creator_df_expanded, |
| 185 | + title_df_expanded, |
| 186 | + publicationYear_df, |
| 187 | + language_df, |
| 188 | + description_df, |
| 189 | + funder_references_df, |
| 190 | + subjects_df, |
| 191 | + related_identifiers_df |
| 192 | + ) |
| 193 | + |
| 194 | + return(out) |
| 195 | + |
| 196 | +} |
| 197 | + |
| 198 | + |
| 199 | +#' Expand tidy dataframes to project metadata template format |
| 200 | +#' |
| 201 | +#' Creates a JSON-like structure in the csv that can be processed using |
| 202 | +#' established workflows in this package. |
| 203 | +#' |
| 204 | +#' @param tidy_df data frame. Each row corresponds to a complete entry. |
| 205 | +#' @param group_prefix character. A repeatable metadata property in the project |
| 206 | +#' metadata section of WDDS. See https://viralemergence.github.io/wddsWizard/articles/schema_overview.html#project_metadata |
| 207 | +#' |
| 208 | +#' @family Project Metadata |
| 209 | +#' @returns Data frame. The data frame contains the fields Group, Variable, and Value. |
| 210 | +#' @export |
| 211 | +#' |
| 212 | +#' @examples |
| 213 | +#' |
| 214 | +#'# a nice tidy dataset |
| 215 | +#' creators_tidy <- data.frame("Name" = paste(letters[1:10],LETTERS[1:10]), |
| 216 | +#' "Given Name" = letters[1:10], |
| 217 | +#' "Family Name" = LETTERS[1:10], |
| 218 | +#' "Name Identifier" = sample(1:100,10,FALSE), |
| 219 | +#' "Affiliation" = letters[11:20], |
| 220 | +#' "Affiliation Identifier" = 11:20, |
| 221 | +#' check.names =FALSE) |
| 222 | +#' |
| 223 | +#'# an expanded dataset that matches the template format. |
| 224 | +#' creators_tidy |> |
| 225 | +#' expand_tidy_dfs(group_prefix = "Creators") |
| 226 | +#' |
| 227 | +#' |
| 228 | +#' |
| 229 | +expand_tidy_dfs <- function(tidy_df,group_prefix){ |
| 230 | + |
| 231 | + assertthat::assert_that(assertthat::is.string(group_prefix),msg = "group_prefix must be character and length 1") |
| 232 | + # number of groups |
| 233 | + num_groups <- nrow(tidy_df) |
| 234 | + |
| 235 | + # group of variables |
| 236 | + group_variables <- names(tidy_df) |
| 237 | + |
| 238 | + df_out <- df_out <- data.frame(Group = character(0), |
| 239 | + Variable = character(0), |
| 240 | + Value = character(0)) |
| 241 | + |
| 242 | + for(i in 1:num_groups){ |
| 243 | + values <- tidy_df[i,] |> unlist() |
| 244 | + names(values) <- NULL |
| 245 | + group_name <- sprintf("%s %s",group_prefix,i) |
| 246 | + Group <- c(group_name, rep("",(length(group_variables)-1))) |
| 247 | + df_i <- data.frame(Group = Group, Variable = group_variables, Value = values) |
| 248 | + df_out <- rbind(df_out, df_i) |
| 249 | + } |
| 250 | + |
| 251 | + return(df_out) |
| 252 | +} |
| 253 | + |
| 254 | +#' A convenience function for making non-repeating items |
| 255 | +#' |
| 256 | +#' @param property string. Metadata group and variable name |
| 257 | +#' @param value A value for that property. |
| 258 | +#' |
| 259 | +#' @family Project Metadata |
| 260 | +#' |
| 261 | +#' @returns data frame. A data frame that conforms to non-repeatable structure in template. |
| 262 | +#' @export |
| 263 | +#' |
| 264 | +#' @examples |
| 265 | +#' language_df <- make_simple_df(property = "language", value = "fr") |
| 266 | +#' |
| 267 | +make_simple_df <- function(property,value){ |
| 268 | + assertthat::assert_that(assertthat::is.string(property),msg = "property must be length 1 and type character") |
| 269 | + |
| 270 | + assertthat::assert_that(assertthat::is.scalar(value),msg = "value must be length 1") |
| 271 | + |
| 272 | + out <- data.frame(Group = property,Variable = property, Value= value) |
| 273 | + return(out) |
| 274 | +} |
0 commit comments