From c6f6df3c51a90e55e32083b6a49e3f3b27be42ff Mon Sep 17 00:00:00 2001 From: nomisto Date: Tue, 12 Apr 2022 10:03:15 +0200 Subject: [PATCH 1/4] Initial mediqa ans dataset --- biodatasets/mediqa_ans/mediqa_ans.py | 263 +++++++++++++++++++++++++++ 1 file changed, 263 insertions(+) create mode 100644 biodatasets/mediqa_ans/mediqa_ans.py diff --git a/biodatasets/mediqa_ans/mediqa_ans.py b/biodatasets/mediqa_ans/mediqa_ans.py new file mode 100644 index 00000000..58579941 --- /dev/null +++ b/biodatasets/mediqa_ans/mediqa_ans.py @@ -0,0 +1,263 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Medical Question-Answer Summarization (MEDIQA-AnS): Question-Driven Summarization of Answers to Consumer Health +Questions. The first summarization collection containing question-driven summaries of answers to consumer health +questions. 
This dataset can be used to evaluate single or multi-document summaries generated by algorithms using +extractive or abstractive approaches.""" + +import itertools as it +import json +import os +from typing import List, Tuple, Dict + +import datasets +from utils import schemas +from utils.configs import BigBioConfig +from utils.constants import Tasks + +_CITATION = """\ +@article{, + author={Savery, Max + and Abacha, Asma Ben + and Gayen, Soumya + and Demner-Fushman, Dina}, + title={Question-driven summarization of answers to consumer health questions}, + journal={Scientific Data}, + year={2020}, + month={Oct}, + day={02}, + volume={7}, + number={1}, + pages={322}, + issn={2052-4463}, + doi={10.1038/s41597-020-00667-z}, + url={https://doi.org/10.1038/s41597-020-00667-z} +} +""" + +_DATASETNAME = "mediqa_ans" + +_DESCRIPTION = """\ +Medical Question-Answer Summarization (MEDIQA-AnS): Question-Driven Summarization of Answers to Consumer Health +Questions. The first summarization collection containing question-driven summaries of answers to consumer health +questions. This dataset can be used to evaluate single or multi-document summaries generated by algorithms using +extractive or abstractive approaches. 
+""" + +_HOMEPAGE = "https://osf.io/fyg46/" + +_LICENSE = "CC0" + +_URLS = { + _DATASETNAME: "https://osf.io/fs57e/download", +} + +_SUPPORTED_TASKS = [Tasks.SUMMARIZATION] + +_SOURCE_VERSION = "1.0.0" + +_BIGBIO_VERSION = "1.0.0" + +class MediqaAnsDataset(datasets.GeneratorBasedBuilder): + """A dataset of manually generated, question-driven summaries of multi and single document answers to consumer health questions.""" + + SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) + BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) + + BUILDER_CONFIGS = [] + + BUILDER_CONFIGS.append( + BigBioConfig( + name=f"mediqa_ans_all_source", + version=BIGBIO_VERSION, + description=f"MEDIQA-AnS All source schema", + schema="source", + subset_id=f"mediqa_ans_all", + ), + ) + + + for setting1 in ["page2answer", "section2answer"]: + for setting2 in ["multi", "single"]: + for setting3 in ["abstractive", "extractive"]: + BUILDER_CONFIGS.append( + BigBioConfig( + name=f"mediqa_ans_{setting1}_{setting2}_{setting3}_bigbio_t2t", + version=BIGBIO_VERSION, + description=f"MEDIQA-AnS {setting1} {setting2.capitalize()} {setting3.capitalize()} BigBio schema", + schema="bigbio_t2t", + subset_id=f"mediqa_ans_{setting1}_{setting2}_{setting3}", + ) + ) + BUILDER_CONFIGS.append( + BigBioConfig( + name=f"mediqa_ans_{setting1}_{setting2}_{setting3}_source", + version=BIGBIO_VERSION, + description=f"MEDIQA-AnS {setting1} {setting2.capitalize()} {setting3.capitalize()} source schema", + schema="source", + subset_id=f"mediqa_ans_{setting1}_{setting2}_{setting3}", + ), + ) + + DEFAULT_CONFIG_NAME = "mediqa_ans_page2answer_multi_abstractive_source" + + def _info(self) -> datasets.DatasetInfo: + + if self.config.schema == "source" and self.config.subset_id == "mediqa_ans_all": + features = datasets.Features( + { + "id": datasets.Value("string"), + "question": datasets.Value("string"), + "multi_abs_summ": datasets.Value("string"), + "multi_ext_summ": datasets.Value("string"), + "answers": [ + { + "id": 
datasets.Value("string"), + "answer_abs_summ": datasets.Value("string"), + "answer_ext_summ": datasets.Value("string"), + "section": datasets.Value("string"), + "article": datasets.Value("string"), + "url": datasets.Value("string"), + "rating": datasets.Value("string"), + } + ], + } + ) + elif self.config.schema == "source": + features = datasets.Features( + { + "question": datasets.Value("string"), + "question_id": datasets.Value("string"), + "summary": datasets.Value("string"), + "articles": [{ + "answer_id": datasets.Value("string"), + "text": datasets.Value("string"), + "rating": datasets.Value("string"), + }] + } + ) + elif self.config.schema == "bigbio_t2t": + features = schemas.text2text_features + + return datasets.DatasetInfo( + description=_DESCRIPTION, + features=features, + homepage=_HOMEPAGE, + license=_LICENSE, + citation=_CITATION, + ) + + def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: + """Returns SplitGenerators.""" + + urls = _URLS[_DATASETNAME] + file_path = dl_manager.download_and_extract(urls) + + return [ + datasets.SplitGenerator( + name=datasets.Split.TRAIN, + gen_kwargs={ + "filepath": os.path.join(file_path), + }, + ), + ] + + def _generate_examples(self, filepath) -> Tuple[int, Dict]: + """Yields examples as (key, example) tuples.""" + + dataset = None + with open(filepath, "r", encoding="utf8") as infile: + dataset = json.load(infile) + + + + uid = it.count(0) + if self.config.name == "mediqa_ans_all_source": + dataset = self._json_dict_to_list(dataset, "id") + for example in dataset: + example["answers"] = self._json_dict_to_list(example["answers"], "id") + yield example["id"], example + else: + _, setting1, setting2, setting3 = self.config.subset_id.rsplit("_",3) + if self.config.schema == "source": + for example in self._generate_setting_examples(dataset, setting1, setting2, setting3): + yield next(uid), example + elif self.config.schema == "bigbio_t2t": + for example in 
self._generate_setting_examples(dataset, setting1, setting2, setting3): + example = self._source_to_t2t(example) + example["id"] = next(uid) + yield example["id"], example + + def _generate_setting_examples(self, dataset, setting1, setting2, setting3): + for question_id, question in dataset.items(): + example = {} + example["question_id"] = question_id + example["question"] = question["question"] + if setting2 == "single": + for answer_id, answer in question["answers"].items(): + example_ = example.copy() + if setting1 == "section2answer": + example_["articles"] = [{"answer_id": answer_id, "text": answer["section"], "rating": answer["rating"]}] + elif setting1 == "page2answer": + example_["articles"] = [{"answer_id": answer_id, "text": answer["article"], "rating": answer["rating"]}] + if setting3 == "abstractive": + example_["summary"] = answer["answer_abs_summ"] + elif setting3 == "extractive": + example_["summary"] = answer["answer_ext_summ"] + yield example_ + elif setting2 == "multi": + example["articles"] = [] + for answer_id, answer in question["answers"].items(): + if setting1 == "section2answer": + example["articles"].append({"answer_id": answer_id, "text": answer["section"], "rating": answer["rating"]}) + elif setting1 == "page2answer": + example["articles"].append({"answer_id": answer_id, "text": answer["article"], "rating": answer["rating"]}) + + if setting3 == "abstractive": + example["summary"] = question["multi_abs_summ"] + elif setting3 == "extractive": + example["summary"] = question["multi_ext_summ"] + yield example + + def _source_to_t2t(self, example): + example_ = {} + example_["document_id"] = "" + example_["text_1_name"] = "" + example_["text_2_name"] = "" + + text1 = "" + text1 += "Question ID: " + example["question_id"] + "\n" + text1 += "Question: " + example["question"] + "\n" + for article in example["articles"]: + text1 += "Answer ID: " + article["answer_id"] + "\n" + text1 += "Text: " + article["text"] + "\n" + text1 += "Rating: " + 
article["rating"] + "\n" + example_["text_1"] = text1 + + example_["text_2"] = example["summary"] + + return example_ + + def _json_dict_to_list(self, json, new_key): + list_ = [] + for key, values in json.items(): + assert isinstance(values, dict), "Child element is not a dict" + assert (new_key not in values), "New key already in values" + values[new_key] = key + list_.append(values) + return list_ + From 3090d0681e3849099c2c80cf6c85c4c17d79a95c Mon Sep 17 00:00:00 2001 From: nomisto Date: Tue, 12 Apr 2022 10:19:19 +0200 Subject: [PATCH 2/4] changed label 'Text' to 'Answer' in text_1 --- biodatasets/mediqa_ans/mediqa_ans.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/biodatasets/mediqa_ans/mediqa_ans.py b/biodatasets/mediqa_ans/mediqa_ans.py index 58579941..848ec61c 100644 --- a/biodatasets/mediqa_ans/mediqa_ans.py +++ b/biodatasets/mediqa_ans/mediqa_ans.py @@ -244,7 +244,7 @@ def _source_to_t2t(self, example): text1 += "Question: " + example["question"] + "\n" for article in example["articles"]: text1 += "Answer ID: " + article["answer_id"] + "\n" - text1 += "Text: " + article["text"] + "\n" + text1 += "Answer: " + article["text"] + "\n" text1 += "Rating: " + article["rating"] + "\n" example_["text_1"] = text1 From 30339253732d518e8f9048201edc420b59652822 Mon Sep 17 00:00:00 2001 From: nomisto Date: Tue, 12 Apr 2022 10:49:08 +0200 Subject: [PATCH 3/4] reformat --- biodatasets/mediqa_ans/mediqa_ans.py | 77 ++++++++++++++++------------ 1 file changed, 44 insertions(+), 33 deletions(-) diff --git a/biodatasets/mediqa_ans/mediqa_ans.py b/biodatasets/mediqa_ans/mediqa_ans.py index 848ec61c..e1fa5d97 100644 --- a/biodatasets/mediqa_ans/mediqa_ans.py +++ b/biodatasets/mediqa_ans/mediqa_ans.py @@ -14,17 +14,18 @@ # limitations under the License. """ -Medical Question-Answer Summarization (MEDIQA-AnS): Question-Driven Summarization of Answers to Consumer Health -Questions. 
The first summarization collection containing question-driven summaries of answers to consumer health -questions. This dataset can be used to evaluate single or multi-document summaries generated by algorithms using +Medical Question-Answer Summarization (MEDIQA-AnS): Question-Driven Summarization of Answers to Consumer Health +Questions. The first summarization collection containing question-driven summaries of answers to consumer health +questions. This dataset can be used to evaluate single or multi-document summaries generated by algorithms using extractive or abstractive approaches.""" import itertools as it import json import os -from typing import List, Tuple, Dict +from typing import Dict, List, Tuple import datasets + from utils import schemas from utils.configs import BigBioConfig from utils.constants import Tasks @@ -52,9 +53,9 @@ _DATASETNAME = "mediqa_ans" _DESCRIPTION = """\ -Medical Question-Answer Summarization (MEDIQA-AnS): Question-Driven Summarization of Answers to Consumer Health -Questions. The first summarization collection containing question-driven summaries of answers to consumer health -questions. This dataset can be used to evaluate single or multi-document summaries generated by algorithms using +Medical Question-Answer Summarization (MEDIQA-AnS): Question-Driven Summarization of Answers to Consumer Health +Questions. The first summarization collection containing question-driven summaries of answers to consumer health +questions. This dataset can be used to evaluate single or multi-document summaries generated by algorithms using extractive or abstractive approaches. """ @@ -72,8 +73,12 @@ _BIGBIO_VERSION = "1.0.0" + class MediqaAnsDataset(datasets.GeneratorBasedBuilder): - """A dataset of manually generated, question-driven summaries of multi and single document answers to consumer health questions.""" + """ + A dataset of manually generated, question-driven summaries of multi and + single document answers to consumer health questions. 
+ """ SOURCE_VERSION = datasets.Version(_SOURCE_VERSION) BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION) @@ -82,15 +87,14 @@ class MediqaAnsDataset(datasets.GeneratorBasedBuilder): BUILDER_CONFIGS.append( BigBioConfig( - name=f"mediqa_ans_all_source", + name="mediqa_ans_all_source", version=BIGBIO_VERSION, - description=f"MEDIQA-AnS All source schema", + description="MEDIQA-AnS All source schema", schema="source", - subset_id=f"mediqa_ans_all", + subset_id="mediqa_ans_all", ), ) - for setting1 in ["page2answer", "section2answer"]: for setting2 in ["multi", "single"]: for setting3 in ["abstractive", "extractive"]: @@ -98,7 +102,7 @@ class MediqaAnsDataset(datasets.GeneratorBasedBuilder): BigBioConfig( name=f"mediqa_ans_{setting1}_{setting2}_{setting3}_bigbio_t2t", version=BIGBIO_VERSION, - description=f"MEDIQA-AnS {setting1} {setting2.capitalize()} {setting3.capitalize()} BigBio schema", + description=f"MEDIQA-AnS {setting1} {setting2} {setting3} BigBio schema", schema="bigbio_t2t", subset_id=f"mediqa_ans_{setting1}_{setting2}_{setting3}", ) @@ -107,7 +111,7 @@ class MediqaAnsDataset(datasets.GeneratorBasedBuilder): BigBioConfig( name=f"mediqa_ans_{setting1}_{setting2}_{setting3}_source", version=BIGBIO_VERSION, - description=f"MEDIQA-AnS {setting1} {setting2.capitalize()} {setting3.capitalize()} source schema", + description=f"MEDIQA-AnS {setting1} {setting2} {setting3} source schema", schema="source", subset_id=f"mediqa_ans_{setting1}_{setting2}_{setting3}", ), @@ -126,7 +130,7 @@ def _info(self) -> datasets.DatasetInfo: "multi_ext_summ": datasets.Value("string"), "answers": [ { - "id": datasets.Value("string"), + "id": datasets.Value("string"), "answer_abs_summ": datasets.Value("string"), "answer_ext_summ": datasets.Value("string"), "section": datasets.Value("string"), @@ -143,11 +147,13 @@ def _info(self) -> datasets.DatasetInfo: "question": datasets.Value("string"), "question_id": datasets.Value("string"), "summary": datasets.Value("string"), - "articles": 
[{ - "answer_id": datasets.Value("string"), - "text": datasets.Value("string"), - "rating": datasets.Value("string"), - }] + "articles": [ + { + "answer_id": datasets.Value("string"), + "text": datasets.Value("string"), + "rating": datasets.Value("string"), + } + ], } ) elif self.config.schema == "bigbio_t2t": @@ -163,7 +169,7 @@ def _info(self) -> datasets.DatasetInfo: def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: """Returns SplitGenerators.""" - + urls = _URLS[_DATASETNAME] file_path = dl_manager.download_and_extract(urls) @@ -178,13 +184,11 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]: def _generate_examples(self, filepath) -> Tuple[int, Dict]: """Yields examples as (key, example) tuples.""" - + dataset = None with open(filepath, "r", encoding="utf8") as infile: dataset = json.load(infile) - - uid = it.count(0) if self.config.name == "mediqa_ans_all_source": dataset = self._json_dict_to_list(dataset, "id") @@ -192,7 +196,7 @@ def _generate_examples(self, filepath) -> Tuple[int, Dict]: example["answers"] = self._json_dict_to_list(example["answers"], "id") yield example["id"], example else: - _, setting1, setting2, setting3 = self.config.subset_id.rsplit("_",3) + _, setting1, setting2, setting3 = self.config.subset_id.rsplit("_", 3) if self.config.schema == "source": for example in self._generate_setting_examples(dataset, setting1, setting2, setting3): yield next(uid), example @@ -211,9 +215,13 @@ def _generate_setting_examples(self, dataset, setting1, setting2, setting3): for answer_id, answer in question["answers"].items(): example_ = example.copy() if setting1 == "section2answer": - example_["articles"] = [{"answer_id": answer_id, "text": answer["section"], "rating": answer["rating"]}] + example_["articles"] = [ + {"answer_id": answer_id, "text": answer["section"], "rating": answer["rating"]} + ] elif setting1 == "page2answer": - example_["articles"] = [{"answer_id": answer_id, "text": 
answer["article"], "rating": answer["rating"]}] + example_["articles"] = [ + {"answer_id": answer_id, "text": answer["article"], "rating": answer["rating"]} + ] if setting3 == "abstractive": example_["summary"] = answer["answer_abs_summ"] elif setting3 == "extractive": @@ -223,16 +231,20 @@ def _generate_setting_examples(self, dataset, setting1, setting2, setting3): example["articles"] = [] for answer_id, answer in question["answers"].items(): if setting1 == "section2answer": - example["articles"].append({"answer_id": answer_id, "text": answer["section"], "rating": answer["rating"]}) + example["articles"].append( + {"answer_id": answer_id, "text": answer["section"], "rating": answer["rating"]} + ) elif setting1 == "page2answer": - example["articles"].append({"answer_id": answer_id, "text": answer["article"], "rating": answer["rating"]}) + example["articles"].append( + {"answer_id": answer_id, "text": answer["article"], "rating": answer["rating"]} + ) if setting3 == "abstractive": example["summary"] = question["multi_abs_summ"] elif setting3 == "extractive": example["summary"] = question["multi_ext_summ"] yield example - + def _source_to_t2t(self, example): example_ = {} example_["document_id"] = "" @@ -249,15 +261,14 @@ def _source_to_t2t(self, example): example_["text_1"] = text1 example_["text_2"] = example["summary"] - + return example_ def _json_dict_to_list(self, json, new_key): list_ = [] for key, values in json.items(): assert isinstance(values, dict), "Child element is not a dict" - assert (new_key not in values), "New key already in values" + assert new_key not in values, "New key already in values" values[new_key] = key list_.append(values) return list_ - From b07671c2cc7e7910cab1674b34b09f473b542b2b Mon Sep 17 00:00:00 2001 From: Simon Ott Date: Tue, 26 Apr 2022 09:56:56 +0200 Subject: [PATCH 4/4] Added description of subsets --- biodatasets/mediqa_ans/mediqa_ans.py | 77 +++++++++++++++++++++++++++- 1 file changed, 76 insertions(+), 1 deletion(-) diff 
--git a/biodatasets/mediqa_ans/mediqa_ans.py b/biodatasets/mediqa_ans/mediqa_ans.py index e1fa5d97..55d627f9 100644 --- a/biodatasets/mediqa_ans/mediqa_ans.py +++ b/biodatasets/mediqa_ans/mediqa_ans.py @@ -17,7 +17,45 @@ Medical Question-Answer Summarization (MEDIQA-AnS): Question-Driven Summarization of Answers to Consumer Health Questions. The first summarization collection containing question-driven summaries of answers to consumer health questions. This dataset can be used to evaluate single or multi-document summaries generated by algorithms using -extractive or abstractive approaches.""" +extractive or abstractive approaches. + +This dataset contains 8 different subsets which support both source and bigbio schema. These subsets arise from +the combination of three different settings: +- [page2answer, section2answer]: Full page as context / manually selected passages as context +- [multi, single]: Generate summary of answer for specific question across multiple documents / + Generate summary of answer for specific question for each document (each document is a sample) +- [abstractive, extractive]: Abstractive summary / extractive summary + + +List of subset_ids: +- mediqa_ans_page2answer_multi_abstractive + Contains the question ID and question, the answer ID and the full text of the web pages, the + corresponding rating for each answer, and the multi-document abstractive summary. +- mediqa_ans_page2answer_multi_extractive + Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating + for each answer, and the multi-document extractive summary. +- mediqa_ans_page2answer_single_abstractive + Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating + for each answer, and the single document abstractive summary for each answer. 
+- mediqa_ans_page2answer_single_extractive + Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating + for each answer, and the single document extractive summary for each answer. +- mediqa_ans_section2answer_multi_abstractive + Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating + for each answer, and the multi-document abstractive summary. +- mediqa_ans_section2answer_multi_extractive + Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating + for each answer, and the multi-document extractive summary. +- mediqa_ans_section2answer_single_abstractive + Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating + for each answer, and the single document abstractive summary for each answer. +- mediqa_ans_section2answer_single_extractive + Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating + for each answer, and the single document extractive summary for each answer. + +Furthermore, there exists the subset mediqa_ans_all, which only has a source schema and contains +all questions, pages, passages, ratings, urls, and each type of summary. +""" import itertools as it import json @@ -57,6 +95,43 @@ Questions. The first summarization collection containing question-driven summaries of answers to consumer health questions. This dataset can be used to evaluate single or multi-document summaries generated by algorithms using extractive or abstractive approaches. + +This dataset contains 8 different subsets which support both source and bigbio schema. 
These subsets arise from +the permutation of three different settings: +- [page2answer, section2answer]: Full page as context / manually selected passages as context +- [multi, single]: Generate summary of answer for specific question across multiple documents / + Generate summary of answer for specific question for each document (each document is sample) +- [abstractive, extractive]: Abstractive summary / extractive summary + + +List of subset_ids: +- mediqa_ans_page2answer_multi_abstractive + This split contains the question ID and question, the answer ID and the full text of the web pages, the + corresponding rating for each answer, and the multi-document abstractive summary. +- mediqa_ans_page2answer_multi_extractive + Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating + for each answer, and the multi-document extractive summary. +- mediqa_ans_page2answer_single_abstractive + Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating + for each answer, and the single document abstractive summary for each answer. +- mediqa_ans_page2answer_single_extractive + Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating + for each answer, and the single document extractive summary for each answer. +- mediqa_ans_section2answer_multi_abstractive + Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating + for each answer, and the multi-document abstractive summary for each answer. +- mediqa_ans_section2answer_multi_extractive + Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating + for each answer, and the multi-document extractive summary for each answer. 
+- mediqa_ans_section2answer_single_abstractive + Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating + for each answer, and the single document abstractive summary for each answer. +- mediqa_ans_section2answer_single_extractive + Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating + for each answer, and the single document extractive summary for each answer. + +Furthermore there exists the subset mediqa_ans_all for which there only exists a source schema and contains +all questions, pages, passages, ratings, urls, and each type of summaries. """ _HOMEPAGE = "https://osf.io/fyg46/"