opentecr · ruslan-uoc · Jan 19, 2025 · Jan 21, 2025
diff --git a/ruslan/DOI2META/Task_18_prepare_a_pipeline_to_pull_data_about_publications.ipynb b/ruslan/DOI2META/Task_18_prepare_a_pipeline_to_pull_data_about_publications.ipynb
@@ -0,0 +1,350 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "URaGvJWXCSPH",
+        "outputId": "e521ecc0-3dd4-4650-fdfd-edf5274cd18c"
+      },
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (2.32.3)\n",
+            "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests) (3.4.1)\n",
+            "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests) (3.10)\n",
+            "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests) (2.3.0)\n",
+            "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests) (2024.12.14)\n"
+          ]
+        }
+      ],
+      "source": [
+        "pip install requests"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "pip install biopython"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "jgO7mM4FHfZ7",
+        "outputId": "7c219e20-71a5-48b6-abb9-21346f089fde"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Requirement already satisfied: biopython in /usr/local/lib/python3.11/dist-packages (1.85)\n",
+            "Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (from biopython) (1.26.4)\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "pip install odfpy"
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "B9GVsFTnfV01",
+        "outputId": "3d359ad3-3a53-4cee-ef11-d44c2342c41c"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Requirement already satisfied: odfpy in /usr/local/lib/python3.11/dist-packages (1.4.1)\n",
+            "Requirement already satisfied: defusedxml in /usr/local/lib/python3.11/dist-packages (from odfpy) (0.7.1)\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "!wget -O \"openTECR recuration.ods\" \"https://docs.google.com/spreadsheets/d/1jLIxEXVzE2SAzIB0UxBfcFoHrzjzf9euB6ART2VDE8c/export?format=ods\""
+      ],
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "3b9q8AZgezQq",
+        "outputId": "c0d01a92-2a35-4588-d16e-b6e84c27e431"
+      },
+      "execution_count": null,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "--2025-01-19 09:11:24--  https://docs.google.com/spreadsheets/d/1jLIxEXVzE2SAzIB0UxBfcFoHrzjzf9euB6ART2VDE8c/export?format=ods\n",
+            "Resolving docs.google.com (docs.google.com)... 142.251.179.100, 142.251.179.138, 142.251.179.101, ...\n",
+            "Connecting to docs.google.com (docs.google.com)|142.251.179.100|:443... connected.\n",
+            "HTTP request sent, awaiting response... 307 Temporary Redirect\n",
+            "Location: https://doc-00-8k-sheets.googleusercontent.com/export/54bogvaave6cua4cdnls17ksc4/q2h8j528ksl67k4dceiprkoqoc/1737277880000/115120384215235060766/*/1jLIxEXVzE2SAzIB0UxBfcFoHrzjzf9euB6ART2VDE8c?format=ods [following]\n",
+            "Warning: wildcards not supported in HTTP.\n",
+            "--2025-01-19 09:11:24--  https://doc-00-8k-sheets.googleusercontent.com/export/54bogvaave6cua4cdnls17ksc4/q2h8j528ksl67k4dceiprkoqoc/1737277880000/115120384215235060766/*/1jLIxEXVzE2SAzIB0UxBfcFoHrzjzf9euB6ART2VDE8c?format=ods\n",
+            "Resolving doc-00-8k-sheets.googleusercontent.com (doc-00-8k-sheets.googleusercontent.com)... 172.253.115.132, 2607:f8b0:4004:c06::84\n",
+            "Connecting to doc-00-8k-sheets.googleusercontent.com (doc-00-8k-sheets.googleusercontent.com)|172.253.115.132|:443... connected.\n",
+            "HTTP request sent, awaiting response... 200 OK\n",
+            "Length: unspecified [application/vnd.oasis.opendocument.spreadsheet]\n",
+            "Saving to: ‘openTECR recuration.ods’\n",
+            "\n",
+            "openTECR recuration     [ <=>                ]   2.43M  --.-KB/s    in 0.07s   \n",
+            "\n",
+            "2025-01-19 09:11:47 (36.2 MB/s) - ‘openTECR recuration.ods’ saved [2549105]\n",
+            "\n"
+          ]
+        }
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "import pandas as pd\n",
+        "from Bio import Entrez\n",
+        "import numpy as np\n",
+        "import time\n",
+        "\n",
+        "def crossref(doi):\n",
+        "    url = f\"https://api.crossref.org/works/{doi}\"\n",
+        "    response = requests.get(url)\n",
+        "    if response.status_code == 200:\n",
+        "        return response.json()\n",
+        "    else:\n",
+        "        return {\"error\": f\"DOI not found in CrossRef: {doi}\"}\n",
+        "\n",
+        "def pubmed(pmid):\n",
+        "    handle = Entrez.efetch(db=\"pubmed\", id=pmid, retmode=\"xml\")\n",
+        "    response = Entrez.read(handle)\n",
+        "    handle.close()\n",
+        "    return response\n",
+        "\n",
+        "def doires2meta(response):\n",
+        "  metadata = {}\n",
+        "  metadata['Title'] = response['message']['title']\n",
+        "  date = response['message']['created']['date-parts'][0]\n",
+        "  metadata['Date'] = f'{date[2]}.{date[1]}.{date[0]}'\n",
+        "  metadata['Publisher'] = response['message']['publisher']\n",
+        "  metadata['License'] = response['message']['license'][0]['URL']\n",
+        "  metadata['Type'] = response['message']['type']\n",
+        "  metadata['Volume'] = response['message']['volume']\n",
+        "  metadata['Issue'] = response['message']['issue']\n",
+        "  metadata['Page'] = response['message']['page']\n",
+        "  fn = []\n",
+        "  sn = []\n",
+        "  for b in response['message']['author']:\n",
+        "    fn.append(b['given'])\n",
+        "    sn.append(b['family'])\n",
+        "\n",
+        "  string = ''\n",
+        "\n",
+        "  for i in range(len(fn)):\n",
+        "    string = string + fn[i] + \" \" + sn[i] + \", \"\n",
+        "\n",
+        "  string = string[:-2]\n",
+        "  metadata['Authors'] = string\n",
+        "  metadata['Language'] = response['message']['language']\n",
+        "  return metadata\n",
+        "\n",
+        "def pubmed2meta(response):\n",
+        "  metadata = {}\n",
+        "  try:\n",
+        "    metadata['Language'] = response['PubmedArticle'][0]['MedlineCitation']['Article']['Language']\n",
+        "  except:\n",
+        "    metadata['Language'] = '-'\n",
+        "  try:\n",
+        "    metadata['Volume'] = response['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']['JournalIssue']['Volume']\n",
+        "  except:\n",
+        "    metadata['Volume'] = '-'\n",
+        "  try:\n",
+        "    metadata['Issue'] = response['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']['JournalIssue']['Issue']\n",
+        "  except:\n",
+        "    metadata['Issue'] = \"-\"\n",
+        "  try:\n",
+        "    metadata['Page'] = response['PubmedArticle'][0]['MedlineCitation']['Article']['Pagination']['MedlinePgn']\n",
+        "  except:\n",
+        "    metadata['Page'] = \"-\"\n",
+        "  date  = response['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']\n",
+        "  try:\n",
+        "    metadata['Journal'] = response['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']['Title']\n",
+        "  except:\n",
+        "    metadata['Journal'] = \"-\"\n",
+        "  try:\n",
+        "    metadata['Abstract'] = response['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']['AbstractText'][0]\n",
+        "  except:\n",
+        "    metadata['Abstract'] = '-'\n",
+        "  #print(date)\n",
+        "  try:\n",
+        "    month = date['Month']\n",
+        "  except:\n",
+        "    month = ' '\n",
+        "  year = date['Year']\n",
+        "  metadata['Date'] = f'{month} {year}'\n",
+        "  try:\n",
+        "    fn = []\n",
+        "    sn = []\n",
+        "    for b in response['PubmedArticle'][0]['MedlineCitation']['Article']['AuthorList']:\n",
+        "      fn.append(b['ForeName'])\n",
+        "      sn.append(b['LastName'])\n",
+        "\n",
+        "    string = ''\n",
+        "\n",
+        "    for i in range(len(fn)):\n",
+        "      string = string + fn[i] + \" \" + sn[i] + \", \"\n",
+        "\n",
+        "    string = string[:-2]\n",
+        "    metadata['Authors'] = string\n",
+        "  except:\n",
+        "    metadata['Authors'] = \"-\"\n",
+        "  return metadata\n",
+        "\n",
+        "def return_blank(indi):\n",
+        "  metadata = {}\n",
+        "  if indi == \"pub\":\n",
+        "    metadata['Language'] = '-'\n",
+        "    metadata['Volume'] = '-'\n",
+        "    metadata['Issue'] = '-'\n",
+        "    metadata['Page'] = '-'\n",
+        "    metadata['Journal'] = '-'\n",
+        "    metadata['Abstract'] = '-'\n",
+        "    metadata['Date'] = '-'\n",
+        "    metadata['Authors'] = '-'\n",
+        "  if indi == \"doi\":\n",
+        "    metadata['Title'] = '-'\n",
+        "    metadata['Language'] = '-'\n",
+        "    metadata['Volume'] = '-'\n",
+        "    metadata['Issue'] = '-'\n",
+        "    metadata['Page'] = '-'\n",
+        "    metadata['Publisher'] = '-'\n",
+        "    metadata['Date'] = '-'\n",
+        "    metadata['Authors'] = '-'\n",
+        "    metadata['License'] = '-'\n",
+        "    metadata['Type'] = '-'\n",
+        "  return metadata"
+      ],
+      "metadata": {
+        "id": "un4HAXdiGVSB"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "Entrez.email = \"[email protected]\"\n",
+        "df = pd.read_excel(\"openTECR recuration.ods\", sheet_name=\"references\")\n",
+        "pmids = df['pmid'].to_list()\n",
+        "dois = df['doi'].to_list()\n"
+      ],
+      "metadata": {
+        "id": "8YsLraZn6JKK"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "doismeta = []\n",
+        "for d in dois:\n",
+        "  if type(d) == type('Is a string?'):\n",
+        "    r = crossref(d)\n",
+        "    #print(r)\n",
+        "\n",
+        "    try:\n",
+        "      d = doires2meta(r)\n",
+        "    except:\n",
+        "      d = return_blank('doi')\n",
+        "\n",
+        "    doismeta.append(d)\n",
+        "  else:\n",
+        "    c = return_blank('doi')\n",
+        "    doismeta.append(c)"
+      ],
+      "metadata": {
+        "collapsed": true,
+        "id": "Nx273Y9i_qpF"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "pmidsmeta = []\n",
+        "for pmid in pmids:\n",
+        "  if np.isnan(pmid) == False:\n",
+        "    time.sleep(1)\n",
+        "    r = pubmed(str(pmid))\n",
+        "    #print(r)\n",
+        "    d = pubmed2meta(r)\n",
+        "    pmidsmeta.append(d)\n",
+        "  else:\n",
+        "    c = return_blank('pub')\n",
+        "    pmidsmeta.append(c)"
+      ],
+      "metadata": {
+        "id": "XhQRgqi11jVy"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "dfpub = pd.DataFrame(pmidsmeta)\n",
+        "dfdoi = pd.DataFrame(doismeta)"
+      ],
+      "metadata": {
+        "id": "UIRdf5jcKqmD"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "dfpubC = pd.concat([df, dfpub], axis=1)\n",
+        "dfdoiC = pd.concat([df, dfdoi], axis=1)\n",
+        "dfpubC.to_csv('openTECRmetadataPubMed.csv')\n",
+        "dfdoiC.to_csv('openTECRmetadataDOI.csv')"
+      ],
+      "metadata": {
+        "id": "L7zvwcIHLins"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}