Skip to content

Commit 3489fda

Browse files
Merge pull request #43 from viralemergence/feature/metadata_wizard
Feature/metadata wizard
2 parents 408dad1 + f3d9e0c commit 3489fda

File tree

174 files changed

+21241
-108
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

174 files changed

+21241
-108
lines changed

.Rbuildignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,4 @@
99
^doc$
1010
^Meta$
1111
^codemeta\.json$
12+
^deposit_files$

DESCRIPTION

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: wddsWizard
22
Title: Data Wizard for a Minimal Wildlife Disease Data Standard
3-
Version: 0.1.0
3+
Version: 0.2.0
44
Authors@R:
55
person("Collin",
66
"Schwantes",
@@ -20,6 +20,7 @@ Imports:
2020
here,
2121
jsonlite,
2222
purrr,
23+
readr,
2324
rlang,
2425
snakecase,
2526
stringr,
@@ -32,7 +33,9 @@ Suggests:
3233
readxl,
3334
tibble,
3435
jsonvalidate,
35-
testthat (>= 3.0.0)
36+
testthat (>= 3.0.0),
37+
taxize,
38+
deposits (>= 0.2.1.63)
3639
Config/testthat/edition: 3
3740
VignetteBuilder: knitr
3841
Depends:
@@ -41,3 +44,5 @@ LazyData: true
4144
License: MIT + file LICENSE
4245
URL: https://viralemergence.github.io/wddsWizard/
4346
BugReports: https://github.com/viralemergence/wddsWizard/issues
47+
Remotes:
48+
ropenscilabs/deposits

NAMESPACE

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,16 @@ export(clean_field_names)
55
export(create_schema_docs)
66
export(create_schema_list)
77
export(download_deposit_version)
8+
export(expand_tidy_dfs)
9+
export(extract_metadata_from_doi)
10+
export(extract_metadata_oa)
11+
export(generate_metadata_csv)
12+
export(generate_repeat_dfs)
813
export(get_entity)
914
export(get_required_fields)
1015
export(list_deposit_versions)
16+
export(make_simple_df)
17+
export(na_to_blank)
1118
export(paste_reduce)
1219
export(paste_reduce_ul)
1320
export(prep_affiliation)
@@ -31,8 +38,11 @@ export(prep_rights)
3138
export(prep_subjects)
3239
export(prep_titles)
3340
export(sanitize_version)
41+
export(translate_to_dcmi)
3442
export(use_wdds_template)
3543
export(wdds_data_templates)
3644
export(wdds_example_data)
3745
export(wdds_json)
46+
export(wdds_to_dcmi)
47+
export(wdds_to_pharos)
3848
importFrom(rlang,.data)

NEWS.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
# wddsWizard 0.2.0
2+
3+
* extract metadata from DOIs for projects
4+
* translate between data standards
5+
* generate metadata templates
6+
7+
18
# wddsWizard 0.1.0
29

310
# wddsWizard 0.0.3

R/extract_metadata_from_doi.R

Lines changed: 274 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,274 @@
1+
#' Extract Project Metadata from DOI
2+
#'
3+
#' Some works are explicitly connected to a publication and the metadata for that publication are fairly complete.
4+
#' Instead of re-writing the metadata, it would be better to extract it and transform it.
5+
#'
6+
#' @param doi String. DOI for a published work
7+
#' @param file_path String. Where should the output be written?
8+
#' @param write_output Logical. Should the output be written to a file?
9+
#' @family Project Metadata
10+
#'
11+
#' @returns data frame. A data frame structured in the same way as the metadata template csv.
12+
#' @export
13+
#'
14+
#' @examples
15+
#'
16+
#' doi <-"doi.org/10.1038/s41597-025-05332-x"
17+
#' extract_metadata_from_doi(doi = doi,write_output=FALSE)
18+
#'
19+
extract_metadata_from_doi <- function(doi, file_path, write_output = TRUE){
20+
if(!curl::has_internet()){
21+
rlang::abort("An internet connection is required to extract metadata")
22+
}
23+
24+
assertthat::assert_that(assertthat::is.string(doi),msg = "doi must be a non-vector string")
25+
26+
assertthat::assert_that(assertthat::is.scalar(write_output)&is.logical(write_output),msg = "write_output must be logical and scalar")
27+
28+
# make sure DOI is properly formatted
29+
doi <- trimws(doi,"both")
30+
31+
out <- extract_metadata_oa(doi = doi)
32+
# extract_metadata_zenodo(doi = doi)
33+
34+
if(write_output){
35+
assertthat::assert_that(assertthat::is.string(file_path),msg = "file_path must be a non-vector string")
36+
readr::write_csv(x = out,file = file_path)
37+
}
38+
39+
return(out)
40+
}
41+
42+
#' Extract Metadata from Open Alex record
43+
#'
44+
#' Uses the DOI for a work to extract metadata from OpenAlex - https://openalex.org/.
45+
#' The OpenAlex data model does not included some fields that are part
46+
#' of the wdds project metadata related identifiers.
47+
#'
48+
#' Carefully review and edit the metadata produced.
49+
#'
50+
#' We recommend writing the metadata to a csv, editing the csv, then
51+
#' processing it as demonstrated in the project metadata tutorial.
52+
#'
53+
#'
54+
#' @param doi Character. A digital object identifier for a published work.
55+
#'
56+
#' @returns data frame. A data frame structured in the same way as the metadata template CSV.
57+
#' @family Project Metadata
58+
#' @export
59+
#'
60+
#' @examples
61+
#'
62+
#' doi <- "doi.org/10.1038/s41597-025-05332-x"
63+
#' extract_metadata_oa(doi = doi)
64+
#'
65+
extract_metadata_oa<-function(doi){
66+
67+
assertthat::assert_that(assertthat::is.string(doi),msg = "doi must be a non-vector string")
68+
69+
# doi.org/10.1038/s41597-025-05332-x
70+
oa_url <- sprintf("https://api.openalex.org/works/%s",doi)
71+
72+
oa_json <- jsonlite::fromJSON(txt = oa_url)
73+
oa_json$authorships$affiliations
74+
75+
# Name Jane Doe
76+
# Given Name Jane
77+
# Family Name Doe
78+
# Name Identifier https://orcid.org/0000-0003-fake-1111
79+
# Affiliation Department of Biology, State University
80+
# Affiliation Identifier https://ror.org/02aqsfake
81+
82+
## get creators
83+
84+
creators <- oa_json$authorships
85+
86+
# get name
87+
88+
creator_df <- data.frame("Name" = creators$raw_author_name)
89+
creator_df <- creator_df |>
90+
dplyr::mutate("Family Name" = stringr::str_split_i(.data$Name, pattern = " ", -1)) |>
91+
dplyr::mutate("Given Name" = stringr::str_remove(.data$Name, pattern = .data$`Family Name`))
92+
93+
# get identifier
94+
creator_df$`Name Identifier` <- creators$author$orcid
95+
96+
# get affiliation string and identifier
97+
98+
aff_df <- creators$affiliations |>
99+
purrr::map_df(function(x){
100+
raw_affiliation <- x$raw_affiliation_string[[1]]
101+
102+
oa_inst_id <- x$institution_ids[[1]][1] |>
103+
fs::path_file()
104+
105+
oa_inst_api <- sprintf("https://api.openalex.org/institutions/%s", oa_inst_id)
106+
107+
oa_inst_list <- jsonlite::fromJSON(oa_inst_api)
108+
109+
oa_inst_list$ror
110+
111+
out <- data.frame("Affiliation" = raw_affiliation, "Affiliation Identifier" = oa_inst_list$ror)
112+
113+
return(out)
114+
})
115+
116+
creator_df_tidy <- cbind(creator_df,aff_df) |>
117+
dplyr::rename("Affiliation Identifier" = .data$Affiliation.Identifier)
118+
119+
# reshape to template form
120+
creator_df_expanded <- expand_tidy_dfs(creator_df_tidy, group_prefix = "Creators")
121+
122+
# titles
123+
title_df_expanded <- data.frame(title = oa_json$title) |>
124+
expand_tidy_dfs(group_prefix = "Titles")
125+
126+
# publicationYear
127+
publicationYear_df <- make_simple_df(property = "publicationYear",value= oa_json$publication_year)
128+
# language
129+
language_df <- make_simple_df(property = "language", value = oa_json$language)
130+
# description
131+
description_df <- data.frame(Description = "FILL ME IN","Description Type" = "abstract",check.names = FALSE) |>
132+
expand_tidy_dfs(group_prefix = "Descriptions")
133+
134+
# fundingReferences
135+
136+
"Funder Name NSF
137+
Funder Identifier http://dx.doi.org/10.13039/10000fake
138+
Award Number DBI 2515340
139+
Award URI https://www.viralemergence.org/grants
140+
Award Title Verena Fellow-in-Residence Award"
141+
142+
funder_references_tidy <- oa_json$grants |>
143+
dplyr::mutate(oa_funder_id = fs::path_file(.data$funder)) |>
144+
dplyr::mutate(oa_funder_api = sprintf("https://api.openalex.org/funders/%s", .data$oa_funder_id)) |>
145+
dplyr::mutate(funder_identifier = purrr::map_chr(.data$oa_funder_api, function(x){
146+
funder_json <- jsonlite::fromJSON(x)
147+
funder_ids <- funder_json$ids
148+
#use one of ror, crossref doi, or openalex id
149+
ids_ordered <- c("ror","doi","wikidata","openalex")
150+
preferred_id <- which(ids_ordered %in% names(funder_ids))[1]
151+
funder_ids[ids_ordered[preferred_id]][[1]]
152+
})
153+
) |>
154+
dplyr::select(dplyr::all_of(c("funder_display_name","funder_identifier","award_id"))) |>
155+
dplyr::rename("Funder Name" = "funder_display_name",
156+
"Funder Identifier" = "funder_identifier",
157+
"Award Number" = "award_id"
158+
) |>
159+
dplyr::mutate(
160+
"Award URI" = "",
161+
"Award ID" = ""
162+
)
163+
164+
funder_references_df <- expand_tidy_dfs(funder_references_tidy, group_prefix = "Funding References")
165+
166+
167+
# subjects
168+
169+
subjects_df <- data.frame(Subject = oa_json$keywords$display_name) |>
170+
expand_tidy_dfs(group_prefix = 'Subjects')
171+
172+
# Related Identifiers
173+
174+
related_identifiers_tidy <- data.frame("Related Identifier" = "A valid Identifier like a DOI",
175+
"Related Identifier Type" = "see accepted values here https://datacite-metadata-schema.readthedocs.io/en/4.5/appendices/appendix-1/relatedIdentifierType/#relatedidentifiertype",
176+
"Relation Type" = "see accepted values here: https://datacite-metadata-schema.readthedocs.io/en/4.5/appendices/appendix-1/relationType/#relationtype",check.names = FALSE )
177+
178+
related_identifiers_df <- related_identifiers_tidy |>
179+
expand_tidy_dfs(group_prefix = "Related Identifiers")
180+
181+
182+
# rbind data frames
183+
184+
out <- rbind(creator_df_expanded,
185+
title_df_expanded,
186+
publicationYear_df,
187+
language_df,
188+
description_df,
189+
funder_references_df,
190+
subjects_df,
191+
related_identifiers_df
192+
)
193+
194+
return(out)
195+
196+
}
197+
198+
199+
#' Expand tidy dataframes to project metadata template format
200+
#'
201+
#' Creates a JSON-like structure in the csv that can be processed using
202+
#' established workflows in this package.
203+
#'
204+
#' @param tidy_df data frame. Each row corresponds to a complete entry.
205+
#' @param group_prefix character. A repeatable metadata property in the project
206+
#' metadata section of WDDS. See https://viralemergence.github.io/wddsWizard/articles/schema_overview.html#project_metadata
207+
#'
208+
#' @family Project Metadata
209+
#' @returns Data frame. The data frame contains the fields Group, Variable, and Value.
210+
#' @export
211+
#'
212+
#' @examples
213+
#'
214+
#'# a nice tidy dataset
215+
#' creators_tidy <- data.frame("Name" = paste(letters[1:10],LETTERS[1:10]),
216+
#' "Given Name" = letters[1:10],
217+
#' "Family Name" = LETTERS[1:10],
218+
#' "Name Identifier" = sample(1:100,10,FALSE),
219+
#' "Affiliation" = letters[11:20],
220+
#' "Affiliation Identifier" = 11:20,
221+
#' check.names =FALSE)
222+
#'
223+
#'# an expanded dataset that matches the template format.
224+
#' creators_tidy |>
225+
#' expand_tidy_dfs(group_prefix = "Creators")
226+
#'
227+
#'
228+
#'
229+
expand_tidy_dfs <- function(tidy_df,group_prefix){
230+
231+
assertthat::assert_that(assertthat::is.string(group_prefix),msg = "group_prefix must be character and length 1")
232+
# number of groups
233+
num_groups <- nrow(tidy_df)
234+
235+
# group of variables
236+
group_variables <- names(tidy_df)
237+
238+
df_out <- df_out <- data.frame(Group = character(0),
239+
Variable = character(0),
240+
Value = character(0))
241+
242+
for(i in 1:num_groups){
243+
values <- tidy_df[i,] |> unlist()
244+
names(values) <- NULL
245+
group_name <- sprintf("%s %s",group_prefix,i)
246+
Group <- c(group_name, rep("",(length(group_variables)-1)))
247+
df_i <- data.frame(Group = Group, Variable = group_variables, Value = values)
248+
df_out <- rbind(df_out, df_i)
249+
}
250+
251+
return(df_out)
252+
}
253+
254+
#' A convenience function for making non-repeating items
255+
#'
256+
#' @param property string. Metadata group and variable name
257+
#' @param value A value for that property.
258+
#'
259+
#' @family Project Metadata
260+
#'
261+
#' @returns data frame. A data frame that conforms to non-repeatable structure in template.
262+
#' @export
263+
#'
264+
#' @examples
265+
#' language_df <- make_simple_df(property = "language", value = "fr")
266+
#'
267+
make_simple_df <- function(property,value){
268+
assertthat::assert_that(assertthat::is.string(property),msg = "property must be length 1 and type character")
269+
270+
assertthat::assert_that(assertthat::is.scalar(value),msg = "value must be length 1")
271+
272+
out <- data.frame(Group = property,Variable = property, Value= value)
273+
return(out)
274+
}

0 commit comments

Comments
 (0)