Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,350 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "URaGvJWXCSPH",
"outputId": "e521ecc0-3dd4-4650-fdfd-edf5274cd18c"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (2.32.3)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests) (3.4.1)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests) (3.10)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests) (2.3.0)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests) (2024.12.14)\n"
]
}
],
"source": [
"pip install requests"
]
},
{
"cell_type": "code",
"source": [
"pip install biopython"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "jgO7mM4FHfZ7",
"outputId": "7c219e20-71a5-48b6-abb9-21346f089fde"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: biopython in /usr/local/lib/python3.11/dist-packages (1.85)\n",
"Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (from biopython) (1.26.4)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"pip install odfpy"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "B9GVsFTnfV01",
"outputId": "3d359ad3-3a53-4cee-ef11-d44c2342c41c"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Requirement already satisfied: odfpy in /usr/local/lib/python3.11/dist-packages (1.4.1)\n",
"Requirement already satisfied: defusedxml in /usr/local/lib/python3.11/dist-packages (from odfpy) (0.7.1)\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"!wget -O \"openTECR recuration.ods\" \"https://docs.google.com/spreadsheets/d/1jLIxEXVzE2SAzIB0UxBfcFoHrzjzf9euB6ART2VDE8c/export?format=ods\""
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3b9q8AZgezQq",
"outputId": "c0d01a92-2a35-4588-d16e-b6e84c27e431"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"--2025-01-19 09:11:24-- https://docs.google.com/spreadsheets/d/1jLIxEXVzE2SAzIB0UxBfcFoHrzjzf9euB6ART2VDE8c/export?format=ods\n",
"Resolving docs.google.com (docs.google.com)... 142.251.179.100, 142.251.179.138, 142.251.179.101, ...\n",
"Connecting to docs.google.com (docs.google.com)|142.251.179.100|:443... connected.\n",
"HTTP request sent, awaiting response... 307 Temporary Redirect\n",
"Location: https://doc-00-8k-sheets.googleusercontent.com/export/54bogvaave6cua4cdnls17ksc4/q2h8j528ksl67k4dceiprkoqoc/1737277880000/115120384215235060766/*/1jLIxEXVzE2SAzIB0UxBfcFoHrzjzf9euB6ART2VDE8c?format=ods [following]\n",
"Warning: wildcards not supported in HTTP.\n",
"--2025-01-19 09:11:24-- https://doc-00-8k-sheets.googleusercontent.com/export/54bogvaave6cua4cdnls17ksc4/q2h8j528ksl67k4dceiprkoqoc/1737277880000/115120384215235060766/*/1jLIxEXVzE2SAzIB0UxBfcFoHrzjzf9euB6ART2VDE8c?format=ods\n",
"Resolving doc-00-8k-sheets.googleusercontent.com (doc-00-8k-sheets.googleusercontent.com)... 172.253.115.132, 2607:f8b0:4004:c06::84\n",
"Connecting to doc-00-8k-sheets.googleusercontent.com (doc-00-8k-sheets.googleusercontent.com)|172.253.115.132|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: unspecified [application/vnd.oasis.opendocument.spreadsheet]\n",
"Saving to: ‘openTECR recuration.ods’\n",
"\n",
"openTECR recuration [ <=> ] 2.43M --.-KB/s in 0.07s \n",
"\n",
"2025-01-19 09:11:47 (36.2 MB/s) - ‘openTECR recuration.ods’ saved [2549105]\n",
"\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"import pandas as pd\n",
"from Bio import Entrez\n",
"import numpy as np\n",
"import time\n",
"\n",
"def crossref(doi):\n",
" url = f\"https://api.crossref.org/works/{doi}\"\n",
" response = requests.get(url)\n",
" if response.status_code == 200:\n",
" return response.json()\n",
" else:\n",
" return {\"error\": f\"DOI not found in CrossRef: {doi}\"}\n",
"\n",
"def pubmed(pmid):\n",
" handle = Entrez.efetch(db=\"pubmed\", id=pmid, retmode=\"xml\")\n",
" response = Entrez.read(handle)\n",
" handle.close()\n",
" return response\n",
"\n",
"def doires2meta(response):\n",
" metadata = {}\n",
" metadata['Title'] = response['message']['title']\n",
" date = response['message']['created']['date-parts'][0]\n",
" metadata['Date'] = f'{date[2]}.{date[1]}.{date[0]}'\n",
" metadata['Publisher'] = response['message']['publisher']\n",
" metadata['License'] = response['message']['license'][0]['URL']\n",
" metadata['Type'] = response['message']['type']\n",
" metadata['Volume'] = response['message']['volume']\n",
" metadata['Issue'] = response['message']['issue']\n",
" metadata['Page'] = response['message']['page']\n",
" fn = []\n",
" sn = []\n",
" for b in response['message']['author']:\n",
" fn.append(b['given'])\n",
" sn.append(b['family'])\n",
"\n",
" string = ''\n",
"\n",
" for i in range(len(fn)):\n",
" string = string + fn[i] + \" \" + sn[i] + \", \"\n",
"\n",
" string = string[:-2]\n",
" metadata['Authors'] = string\n",
" metadata['Language'] = response['message']['language']\n",
" return metadata\n",
"\n",
"def pubmed2meta(response):\n",
" metadata = {}\n",
" try:\n",
" metadata['Language'] = response['PubmedArticle'][0]['MedlineCitation']['Article']['Language']\n",
" except:\n",
" metadata['Language'] = '-'\n",
" try:\n",
" metadata['Volume'] = response['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']['JournalIssue']['Volume']\n",
" except:\n",
" metadata['Volume'] = '-'\n",
" try:\n",
" metadata['Issue'] = response['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']['JournalIssue']['Issue']\n",
" except:\n",
" metadata['Issue'] = \"-\"\n",
" try:\n",
" metadata['Page'] = response['PubmedArticle'][0]['MedlineCitation']['Article']['Pagination']['MedlinePgn']\n",
" except:\n",
" metadata['Page'] = \"-\"\n",
" date = response['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']\n",
" try:\n",
" metadata['Journal'] = response['PubmedArticle'][0]['MedlineCitation']['Article']['Journal']['Title']\n",
" except:\n",
" metadata['Journal'] = \"-\"\n",
" try:\n",
" metadata['Abstract'] = response['PubmedArticle'][0]['MedlineCitation']['Article']['Abstract']['AbstractText'][0]\n",
" except:\n",
" metadata['Abstract'] = '-'\n",
" #print(date)\n",
" try:\n",
" month = date['Month']\n",
" except:\n",
" month = ' '\n",
" year = date['Year']\n",
" metadata['Date'] = f'{month} {year}'\n",
" try:\n",
" fn = []\n",
" sn = []\n",
" for b in response['PubmedArticle'][0]['MedlineCitation']['Article']['AuthorList']:\n",
" fn.append(b['ForeName'])\n",
" sn.append(b['LastName'])\n",
"\n",
" string = ''\n",
"\n",
" for i in range(len(fn)):\n",
" string = string + fn[i] + \" \" + sn[i] + \", \"\n",
"\n",
" string = string[:-2]\n",
" metadata['Authors'] = string\n",
" except:\n",
" metadata['Authors'] = \"-\"\n",
" return metadata\n",
"\n",
"def return_blank(indi):\n",
" metadata = {}\n",
" if indi == \"pub\":\n",
" metadata['Language'] = '-'\n",
" metadata['Volume'] = '-'\n",
" metadata['Issue'] = '-'\n",
" metadata['Page'] = '-'\n",
" metadata['Journal'] = '-'\n",
" metadata['Abstract'] = '-'\n",
" metadata['Date'] = '-'\n",
" metadata['Authors'] = '-'\n",
" if indi == \"doi\":\n",
" metadata['Title'] = '-'\n",
" metadata['Language'] = '-'\n",
" metadata['Volume'] = '-'\n",
" metadata['Issue'] = '-'\n",
" metadata['Page'] = '-'\n",
" metadata['Publisher'] = '-'\n",
" metadata['Date'] = '-'\n",
" metadata['Authors'] = '-'\n",
" metadata['License'] = '-'\n",
" metadata['Type'] = '-'\n",
" return metadata"
],
"metadata": {
"id": "un4HAXdiGVSB"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"Entrez.email = \"[email protected]\"\n",
"df = pd.read_excel(\"openTECR recuration.ods\", sheet_name=\"references\")\n",
"pmids = df['pmid'].to_list()\n",
"dois = df['doi'].to_list()\n"
],
"metadata": {
"id": "8YsLraZn6JKK"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"doismeta = []\n",
"for d in dois:\n",
" if type(d) == type('Is a string?'):\n",
" r = crossref(d)\n",
" #print(r)\n",
"\n",
" try:\n",
" d = doires2meta(r)\n",
" except:\n",
" d = return_blank('doi')\n",
"\n",
" doismeta.append(d)\n",
" else:\n",
" c = return_blank('doi')\n",
" doismeta.append(c)"
],
"metadata": {
"collapsed": true,
"id": "Nx273Y9i_qpF"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"pmidsmeta = []\n",
"for pmid in pmids:\n",
" if np.isnan(pmid) == False:\n",
" time.sleep(1)\n",
" r = pubmed(str(pmid))\n",
" #print(r)\n",
" d = pubmed2meta(r)\n",
" pmidsmeta.append(d)\n",
" else:\n",
" c = return_blank('pub')\n",
" pmidsmeta.append(c)"
],
"metadata": {
"id": "XhQRgqi11jVy"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"dfpub = pd.DataFrame(pmidsmeta)\n",
"dfdoi = pd.DataFrame(doismeta)"
],
"metadata": {
"id": "UIRdf5jcKqmD"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"dfpubC = pd.concat([df, dfpub], axis=1)\n",
"dfdoiC = pd.concat([df, dfdoi], axis=1)\n",
"dfpubC.to_csv('openTECRmetadataPubMed.csv')\n",
"dfdoiC.to_csv('openTECRmetadataDOI.csv')"
],
"metadata": {
"id": "L7zvwcIHLins"
},
"execution_count": null,
"outputs": []
}
]
}
Loading