From c6f6df3c51a90e55e32083b6a49e3f3b27be42ff Mon Sep 17 00:00:00 2001
From: nomisto <simonott@gmx.at>
Date: Tue, 12 Apr 2022 10:03:15 +0200
Subject: [PATCH 1/4] Initial mediqa ans dataset

---
 biodatasets/mediqa_ans/mediqa_ans.py | 263 +++++++++++++++++++++++++++
 1 file changed, 263 insertions(+)
 create mode 100644 biodatasets/mediqa_ans/mediqa_ans.py

diff --git a/biodatasets/mediqa_ans/mediqa_ans.py b/biodatasets/mediqa_ans/mediqa_ans.py
new file mode 100644
index 00000000..58579941
--- /dev/null
+++ b/biodatasets/mediqa_ans/mediqa_ans.py
@@ -0,0 +1,263 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Medical Question-Answer Summarization (MEDIQA-AnS): Question-Driven Summarization of Answers to Consumer Health 
+Questions. The first summarization collection containing question-driven summaries of answers to consumer health 
+questions. This dataset can be used to evaluate single or multi-document summaries generated by algorithms using 
+extractive or abstractive approaches."""
+
+import itertools as it
+import json
+import os
+from typing import List, Tuple, Dict
+
+import datasets
+from utils import schemas
+from utils.configs import BigBioConfig
+from utils.constants import Tasks
+
+_CITATION = """\
+@article{,
+    author={Savery, Max
+        and Abacha, Asma Ben
+        and Gayen, Soumya
+        and Demner-Fushman, Dina},
+    title={Question-driven summarization of answers to consumer health questions},
+    journal={Scientific Data},
+    year={2020},
+    month={Oct},
+    day={02},
+    volume={7},
+    number={1},
+    pages={322},
+    issn={2052-4463},
+    doi={10.1038/s41597-020-00667-z},
+    url={https://doi.org/10.1038/s41597-020-00667-z}
+}
+"""
+
+_DATASETNAME = "mediqa_ans"
+
+_DESCRIPTION = """\
+Medical Question-Answer Summarization (MEDIQA-AnS): Question-Driven Summarization of Answers to Consumer Health 
+Questions. The first summarization collection containing question-driven summaries of answers to consumer health 
+questions. This dataset can be used to evaluate single or multi-document summaries generated by algorithms using 
+extractive or abstractive approaches.
+"""
+
+_HOMEPAGE = "https://osf.io/fyg46/"
+
+_LICENSE = "CC0"
+
+_URLS = {
+    _DATASETNAME: "https://osf.io/fs57e/download",
+}
+
+_SUPPORTED_TASKS = [Tasks.SUMMARIZATION]
+
+_SOURCE_VERSION = "1.0.0"
+
+_BIGBIO_VERSION = "1.0.0"
+
+class MediqaAnsDataset(datasets.GeneratorBasedBuilder):
+    """A dataset of manually generated, question-driven summaries of multi and single document answers to consumer health questions."""
+
+    SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
+    BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION)
+
+    BUILDER_CONFIGS = []
+
+    BUILDER_CONFIGS.append(
+        BigBioConfig(
+            name=f"mediqa_ans_all_source",
+            version=BIGBIO_VERSION,
+            description=f"MEDIQA-AnS All source schema",
+            schema="source",
+            subset_id=f"mediqa_ans_all",
+        ),
+    )
+
+
+    for setting1 in ["page2answer", "section2answer"]:
+        for setting2 in ["multi", "single"]:
+            for setting3 in ["abstractive", "extractive"]:
+                BUILDER_CONFIGS.append(
+                    BigBioConfig(
+                        name=f"mediqa_ans_{setting1}_{setting2}_{setting3}_bigbio_t2t",
+                        version=BIGBIO_VERSION,
+                        description=f"MEDIQA-AnS {setting1} {setting2.capitalize()} {setting3.capitalize()} BigBio schema",
+                        schema="bigbio_t2t",
+                        subset_id=f"mediqa_ans_{setting1}_{setting2}_{setting3}",
+                    )
+                )
+                BUILDER_CONFIGS.append(
+                    BigBioConfig(
+                        name=f"mediqa_ans_{setting1}_{setting2}_{setting3}_source",
+                        version=BIGBIO_VERSION,
+                        description=f"MEDIQA-AnS {setting1} {setting2.capitalize()} {setting3.capitalize()} source schema",
+                        schema="source",
+                        subset_id=f"mediqa_ans_{setting1}_{setting2}_{setting3}",
+                    ),
+                )
+
+    DEFAULT_CONFIG_NAME = "mediqa_ans_page2answer_multi_abstractive_source"
+
+    def _info(self) -> datasets.DatasetInfo:
+
+        if self.config.schema == "source" and self.config.subset_id == "mediqa_ans_all":
+            features = datasets.Features(
+                {
+                    "id": datasets.Value("string"),
+                    "question": datasets.Value("string"),
+                    "multi_abs_summ": datasets.Value("string"),
+                    "multi_ext_summ": datasets.Value("string"),
+                    "answers": [
+                        {
+                            "id":  datasets.Value("string"),
+                            "answer_abs_summ": datasets.Value("string"),
+                            "answer_ext_summ": datasets.Value("string"),
+                            "section": datasets.Value("string"),
+                            "article": datasets.Value("string"),
+                            "url": datasets.Value("string"),
+                            "rating": datasets.Value("string"),
+                        }
+                    ],
+                }
+            )
+        elif self.config.schema == "source":
+            features = datasets.Features(
+                {
+                    "question": datasets.Value("string"),
+                    "question_id": datasets.Value("string"),
+                    "summary": datasets.Value("string"),
+                    "articles": [{
+                        "answer_id": datasets.Value("string"),
+                        "text": datasets.Value("string"),
+                        "rating": datasets.Value("string"),
+                    }]
+                }
+            )
+        elif self.config.schema == "bigbio_t2t":
+            features = schemas.text2text_features
+
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=features,
+            homepage=_HOMEPAGE,
+            license=_LICENSE,
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
+        """Returns SplitGenerators."""
+        
+        urls = _URLS[_DATASETNAME]
+        file_path = dl_manager.download_and_extract(urls)
+
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                gen_kwargs={
+                    "filepath": os.path.join(file_path),
+                },
+            ),
+        ]
+
+    def _generate_examples(self, filepath) -> Tuple[int, Dict]:
+        """Yields examples as (key, example) tuples."""
+        
+        dataset = None
+        with open(filepath, "r", encoding="utf8") as infile:
+            dataset = json.load(infile)
+
+        
+
+        uid = it.count(0)
+        if self.config.name == "mediqa_ans_all_source":
+            dataset = self._json_dict_to_list(dataset, "id")
+            for example in dataset:
+                example["answers"] = self._json_dict_to_list(example["answers"], "id")
+                yield example["id"], example
+        else:
+            _, setting1, setting2, setting3 = self.config.subset_id.rsplit("_",3)
+            if self.config.schema == "source":
+                for example in self._generate_setting_examples(dataset, setting1, setting2, setting3):
+                    yield next(uid), example
+            elif self.config.schema == "bigbio_t2t":
+                for example in self._generate_setting_examples(dataset, setting1, setting2, setting3):
+                    example = self._source_to_t2t(example)
+                    example["id"] = next(uid)
+                    yield example["id"], example
+
+    def _generate_setting_examples(self, dataset, setting1, setting2, setting3):
+        for question_id, question in dataset.items():
+            example = {}
+            example["question_id"] = question_id
+            example["question"] = question["question"]
+            if setting2 == "single":
+                for answer_id, answer in question["answers"].items():
+                    example_ = example.copy()
+                    if setting1 == "section2answer":
+                        example_["articles"] = [{"answer_id": answer_id, "text": answer["section"], "rating": answer["rating"]}]
+                    elif setting1 == "page2answer":
+                        example_["articles"] = [{"answer_id": answer_id, "text": answer["article"], "rating": answer["rating"]}]
+                    if setting3 == "abstractive":
+                        example_["summary"] = answer["answer_abs_summ"]
+                    elif setting3 == "extractive":
+                        example_["summary"] = answer["answer_ext_summ"]
+                    yield example_
+            elif setting2 == "multi":
+                example["articles"] = []
+                for answer_id, answer in question["answers"].items():
+                    if setting1 == "section2answer":
+                        example["articles"].append({"answer_id": answer_id, "text": answer["section"], "rating": answer["rating"]})
+                    elif setting1 == "page2answer":
+                        example["articles"].append({"answer_id": answer_id, "text": answer["article"], "rating": answer["rating"]})
+
+                if setting3 == "abstractive":
+                    example["summary"] = question["multi_abs_summ"]
+                elif setting3 == "extractive":
+                    example["summary"] = question["multi_ext_summ"]
+                yield example
+        
+    def _source_to_t2t(self, example):
+        example_ = {}
+        example_["document_id"] = ""
+        example_["text_1_name"] = ""
+        example_["text_2_name"] = ""
+
+        text1 = ""
+        text1 += "Question ID: " + example["question_id"] + "\n"
+        text1 += "Question: " + example["question"] + "\n"
+        for article in example["articles"]:
+            text1 += "Answer ID: " + article["answer_id"] + "\n"
+            text1 += "Text: " + article["text"] + "\n"
+            text1 += "Rating: " + article["rating"] + "\n"
+        example_["text_1"] = text1
+
+        example_["text_2"] = example["summary"]
+        
+        return example_
+
+    def _json_dict_to_list(self, json, new_key):
+        list_ = []
+        for key, values in json.items():
+            assert isinstance(values, dict), "Child element is not a dict"
+            assert (new_key not in values), "New key already in values"
+            values[new_key] = key
+            list_.append(values)
+        return list_
+

From 3090d0681e3849099c2c80cf6c85c4c17d79a95c Mon Sep 17 00:00:00 2001
From: nomisto <simonott@gmx.at>
Date: Tue, 12 Apr 2022 10:19:19 +0200
Subject: [PATCH 2/4] changed label 'Text' to 'Answer' in text_1

---
 biodatasets/mediqa_ans/mediqa_ans.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/biodatasets/mediqa_ans/mediqa_ans.py b/biodatasets/mediqa_ans/mediqa_ans.py
index 58579941..848ec61c 100644
--- a/biodatasets/mediqa_ans/mediqa_ans.py
+++ b/biodatasets/mediqa_ans/mediqa_ans.py
@@ -244,7 +244,7 @@ def _source_to_t2t(self, example):
         text1 += "Question: " + example["question"] + "\n"
         for article in example["articles"]:
             text1 += "Answer ID: " + article["answer_id"] + "\n"
-            text1 += "Text: " + article["text"] + "\n"
+            text1 += "Answer: " + article["text"] + "\n"
             text1 += "Rating: " + article["rating"] + "\n"
         example_["text_1"] = text1
 

From 30339253732d518e8f9048201edc420b59652822 Mon Sep 17 00:00:00 2001
From: nomisto <simonott@gmx.at>
Date: Tue, 12 Apr 2022 10:49:08 +0200
Subject: [PATCH 3/4] reformat

---
 biodatasets/mediqa_ans/mediqa_ans.py | 77 ++++++++++++++++------------
 1 file changed, 44 insertions(+), 33 deletions(-)

diff --git a/biodatasets/mediqa_ans/mediqa_ans.py b/biodatasets/mediqa_ans/mediqa_ans.py
index 848ec61c..e1fa5d97 100644
--- a/biodatasets/mediqa_ans/mediqa_ans.py
+++ b/biodatasets/mediqa_ans/mediqa_ans.py
@@ -14,17 +14,18 @@
 # limitations under the License.
 
 """
-Medical Question-Answer Summarization (MEDIQA-AnS): Question-Driven Summarization of Answers to Consumer Health 
-Questions. The first summarization collection containing question-driven summaries of answers to consumer health 
-questions. This dataset can be used to evaluate single or multi-document summaries generated by algorithms using 
+Medical Question-Answer Summarization (MEDIQA-AnS): Question-Driven Summarization of Answers to Consumer Health
+Questions. The first summarization collection containing question-driven summaries of answers to consumer health
+questions. This dataset can be used to evaluate single or multi-document summaries generated by algorithms using
 extractive or abstractive approaches."""
 
 import itertools as it
 import json
 import os
-from typing import List, Tuple, Dict
+from typing import Dict, List, Tuple
 
 import datasets
+
 from utils import schemas
 from utils.configs import BigBioConfig
 from utils.constants import Tasks
@@ -52,9 +53,9 @@
 _DATASETNAME = "mediqa_ans"
 
 _DESCRIPTION = """\
-Medical Question-Answer Summarization (MEDIQA-AnS): Question-Driven Summarization of Answers to Consumer Health 
-Questions. The first summarization collection containing question-driven summaries of answers to consumer health 
-questions. This dataset can be used to evaluate single or multi-document summaries generated by algorithms using 
+Medical Question-Answer Summarization (MEDIQA-AnS): Question-Driven Summarization of Answers to Consumer Health
+Questions. The first summarization collection containing question-driven summaries of answers to consumer health
+questions. This dataset can be used to evaluate single or multi-document summaries generated by algorithms using
 extractive or abstractive approaches.
 """
 
@@ -72,8 +73,12 @@
 
 _BIGBIO_VERSION = "1.0.0"
 
+
 class MediqaAnsDataset(datasets.GeneratorBasedBuilder):
-    """A dataset of manually generated, question-driven summaries of multi and single document answers to consumer health questions."""
+    """
+    A dataset of manually generated, question-driven summaries of multi and
+    single document answers to consumer health questions.
+    """
 
     SOURCE_VERSION = datasets.Version(_SOURCE_VERSION)
     BIGBIO_VERSION = datasets.Version(_BIGBIO_VERSION)
@@ -82,15 +87,14 @@ class MediqaAnsDataset(datasets.GeneratorBasedBuilder):
 
     BUILDER_CONFIGS.append(
         BigBioConfig(
-            name=f"mediqa_ans_all_source",
+            name="mediqa_ans_all_source",
             version=BIGBIO_VERSION,
-            description=f"MEDIQA-AnS All source schema",
+            description="MEDIQA-AnS All source schema",
             schema="source",
-            subset_id=f"mediqa_ans_all",
+            subset_id="mediqa_ans_all",
         ),
     )
 
-
     for setting1 in ["page2answer", "section2answer"]:
         for setting2 in ["multi", "single"]:
             for setting3 in ["abstractive", "extractive"]:
@@ -98,7 +102,7 @@ class MediqaAnsDataset(datasets.GeneratorBasedBuilder):
                     BigBioConfig(
                         name=f"mediqa_ans_{setting1}_{setting2}_{setting3}_bigbio_t2t",
                         version=BIGBIO_VERSION,
-                        description=f"MEDIQA-AnS {setting1} {setting2.capitalize()} {setting3.capitalize()} BigBio schema",
+                        description=f"MEDIQA-AnS {setting1} {setting2} {setting3} BigBio schema",
                         schema="bigbio_t2t",
                         subset_id=f"mediqa_ans_{setting1}_{setting2}_{setting3}",
                     )
@@ -107,7 +111,7 @@ class MediqaAnsDataset(datasets.GeneratorBasedBuilder):
                     BigBioConfig(
                         name=f"mediqa_ans_{setting1}_{setting2}_{setting3}_source",
                         version=BIGBIO_VERSION,
-                        description=f"MEDIQA-AnS {setting1} {setting2.capitalize()} {setting3.capitalize()} source schema",
+                        description=f"MEDIQA-AnS {setting1} {setting2} {setting3} source schema",
                         schema="source",
                         subset_id=f"mediqa_ans_{setting1}_{setting2}_{setting3}",
                     ),
@@ -126,7 +130,7 @@ def _info(self) -> datasets.DatasetInfo:
                     "multi_ext_summ": datasets.Value("string"),
                     "answers": [
                         {
-                            "id":  datasets.Value("string"),
+                            "id": datasets.Value("string"),
                             "answer_abs_summ": datasets.Value("string"),
                             "answer_ext_summ": datasets.Value("string"),
                             "section": datasets.Value("string"),
@@ -143,11 +147,13 @@ def _info(self) -> datasets.DatasetInfo:
                     "question": datasets.Value("string"),
                     "question_id": datasets.Value("string"),
                     "summary": datasets.Value("string"),
-                    "articles": [{
-                        "answer_id": datasets.Value("string"),
-                        "text": datasets.Value("string"),
-                        "rating": datasets.Value("string"),
-                    }]
+                    "articles": [
+                        {
+                            "answer_id": datasets.Value("string"),
+                            "text": datasets.Value("string"),
+                            "rating": datasets.Value("string"),
+                        }
+                    ],
                 }
             )
         elif self.config.schema == "bigbio_t2t":
@@ -163,7 +169,7 @@ def _info(self) -> datasets.DatasetInfo:
 
     def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
         """Returns SplitGenerators."""
-        
+
         urls = _URLS[_DATASETNAME]
         file_path = dl_manager.download_and_extract(urls)
 
@@ -178,13 +184,11 @@ def _split_generators(self, dl_manager) -> List[datasets.SplitGenerator]:
 
     def _generate_examples(self, filepath) -> Tuple[int, Dict]:
         """Yields examples as (key, example) tuples."""
-        
+
         dataset = None
         with open(filepath, "r", encoding="utf8") as infile:
             dataset = json.load(infile)
 
-        
-
         uid = it.count(0)
         if self.config.name == "mediqa_ans_all_source":
             dataset = self._json_dict_to_list(dataset, "id")
@@ -192,7 +196,7 @@ def _generate_examples(self, filepath) -> Tuple[int, Dict]:
                 example["answers"] = self._json_dict_to_list(example["answers"], "id")
                 yield example["id"], example
         else:
-            _, setting1, setting2, setting3 = self.config.subset_id.rsplit("_",3)
+            _, setting1, setting2, setting3 = self.config.subset_id.rsplit("_", 3)
             if self.config.schema == "source":
                 for example in self._generate_setting_examples(dataset, setting1, setting2, setting3):
                     yield next(uid), example
@@ -211,9 +215,13 @@ def _generate_setting_examples(self, dataset, setting1, setting2, setting3):
                 for answer_id, answer in question["answers"].items():
                     example_ = example.copy()
                     if setting1 == "section2answer":
-                        example_["articles"] = [{"answer_id": answer_id, "text": answer["section"], "rating": answer["rating"]}]
+                        example_["articles"] = [
+                            {"answer_id": answer_id, "text": answer["section"], "rating": answer["rating"]}
+                        ]
                     elif setting1 == "page2answer":
-                        example_["articles"] = [{"answer_id": answer_id, "text": answer["article"], "rating": answer["rating"]}]
+                        example_["articles"] = [
+                            {"answer_id": answer_id, "text": answer["article"], "rating": answer["rating"]}
+                        ]
                     if setting3 == "abstractive":
                         example_["summary"] = answer["answer_abs_summ"]
                     elif setting3 == "extractive":
@@ -223,16 +231,20 @@ def _generate_setting_examples(self, dataset, setting1, setting2, setting3):
                 example["articles"] = []
                 for answer_id, answer in question["answers"].items():
                     if setting1 == "section2answer":
-                        example["articles"].append({"answer_id": answer_id, "text": answer["section"], "rating": answer["rating"]})
+                        example["articles"].append(
+                            {"answer_id": answer_id, "text": answer["section"], "rating": answer["rating"]}
+                        )
                     elif setting1 == "page2answer":
-                        example["articles"].append({"answer_id": answer_id, "text": answer["article"], "rating": answer["rating"]})
+                        example["articles"].append(
+                            {"answer_id": answer_id, "text": answer["article"], "rating": answer["rating"]}
+                        )
 
                 if setting3 == "abstractive":
                     example["summary"] = question["multi_abs_summ"]
                 elif setting3 == "extractive":
                     example["summary"] = question["multi_ext_summ"]
                 yield example
-        
+
     def _source_to_t2t(self, example):
         example_ = {}
         example_["document_id"] = ""
@@ -249,15 +261,14 @@ def _source_to_t2t(self, example):
         example_["text_1"] = text1
 
         example_["text_2"] = example["summary"]
-        
+
         return example_
 
     def _json_dict_to_list(self, json, new_key):
         list_ = []
         for key, values in json.items():
             assert isinstance(values, dict), "Child element is not a dict"
-            assert (new_key not in values), "New key already in values"
+            assert new_key not in values, "New key already in values"
             values[new_key] = key
             list_.append(values)
         return list_
-

From b07671c2cc7e7910cab1674b34b09f473b542b2b Mon Sep 17 00:00:00 2001
From: Simon Ott <simonott@gmx.at>
Date: Tue, 26 Apr 2022 09:56:56 +0200
Subject: [PATCH 4/4] Added description of subsets

---
 biodatasets/mediqa_ans/mediqa_ans.py | 77 +++++++++++++++++++++++++++-
 1 file changed, 76 insertions(+), 1 deletion(-)

diff --git a/biodatasets/mediqa_ans/mediqa_ans.py b/biodatasets/mediqa_ans/mediqa_ans.py
index e1fa5d97..55d627f9 100644
--- a/biodatasets/mediqa_ans/mediqa_ans.py
+++ b/biodatasets/mediqa_ans/mediqa_ans.py
@@ -17,7 +17,45 @@
 Medical Question-Answer Summarization (MEDIQA-AnS): Question-Driven Summarization of Answers to Consumer Health
 Questions. The first summarization collection containing question-driven summaries of answers to consumer health
 questions. This dataset can be used to evaluate single or multi-document summaries generated by algorithms using
-extractive or abstractive approaches."""
+extractive or abstractive approaches.
+
+This dataset contains 8 different subsets which support both source and bigbio schema. These subsets arise from
+the permutation of three different settings:
+- [page2answer, section2answer]: Full page as context / manually selected passages as context
+- [multi, single]: Generate summary of answer for specific question across multiple documents / 
+                   Generate summary of answer for specific question for each document (each document is sample)
+- [abstractive, extractive]: Abstractive summary / extractive summary
+
+
+List of subset_ids:
+- mediqa_ans_page2answer_multi_abstractive
+  This split contains the question ID and question, the answer ID and the full text of the web pages, the
+  corresponding rating for each answer, and the multi-document abstractive summary. 
+- mediqa_ans_page2answer_multi_extractive
+  Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating
+  for each answer, and the multi-document extractive summary. 
+- mediqa_ans_page2answer_single_abstractive
+  Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating
+  for each answer, and the single document abstractive summary for each answer.
+- mediqa_ans_page2answer_single_extractive
+  Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating
+  for each answer, and the single document extractive summary for each answer.
+- mediqa_ans_section2answer_multi_abstractive
+  Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating
+  for each answer, and the multi-document abstractive summary for each answer.
+- mediqa_ans_section2answer_multi_extractive
+  Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating
+  for each answer, and the multi-document extractive summary for each answer.
+- mediqa_ans_section2answer_single_abstractive
+  Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating
+  for each answer, and the single document abstractive summary for each answer.
+- mediqa_ans_section2answer_single_extractive
+  Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating
+  for each answer, and the single document extractive summary for each answer.
+
+Furthermore there exists the subset mediqa_ans_all for which there only exists a source schema and contains
+all questions, pages, passages, ratings, urls, and each type of summaries.
+"""
 
 import itertools as it
 import json
@@ -57,6 +95,43 @@
 Questions. The first summarization collection containing question-driven summaries of answers to consumer health
 questions. This dataset can be used to evaluate single or multi-document summaries generated by algorithms using
 extractive or abstractive approaches.
+
+This dataset contains 8 different subsets which support both source and bigbio schema. These subsets arise from
+the permutation of three different settings:
+- [page2answer, section2answer]: Full page as context / manually selected passages as context
+- [multi, single]: Generate summary of answer for specific question across multiple documents / 
+                   Generate summary of answer for specific question for each document (each document is sample)
+- [abstractive, extractive]: Abstractive summary / extractive summary
+
+
+List of subset_ids:
+- mediqa_ans_page2answer_multi_abstractive
+  This split contains the question ID and question, the answer ID and the full text of the web pages, the
+  corresponding rating for each answer, and the multi-document abstractive summary. 
+- mediqa_ans_page2answer_multi_extractive
+  Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating
+  for each answer, and the multi-document extractive summary. 
+- mediqa_ans_page2answer_single_abstractive
+  Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating
+  for each answer, and the single document abstractive summary for each answer.
+- mediqa_ans_page2answer_single_extractive
+  Contains the question ID and question, the answer ID and full text of the web pages, the corresponding rating
+  for each answer, and the single document extractive summary for each answer.
+- mediqa_ans_section2answer_multi_abstractive
+  Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating
+  for each answer, and the multi-document abstractive summary for each answer.
+- mediqa_ans_section2answer_multi_extractive
+  Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating
+  for each answer, and the multi-document extractive summary for each answer.
+- mediqa_ans_section2answer_single_abstractive
+  Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating
+  for each answer, and the single document abstractive summary for each answer.
+- mediqa_ans_section2answer_single_extractive
+  Contains the question ID and question, the answer ID and manually selected passages, the corresponding rating
+  for each answer, and the single document extractive summary for each answer.
+
+Furthermore there exists the subset mediqa_ans_all for which there only exists a source schema and contains
+all questions, pages, passages, ratings, urls, and each type of summaries.
 """
 
 _HOMEPAGE = "https://osf.io/fyg46/"