Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ authors:
repository-code: 'https://github.com/fastdatascience/drug_named_entity_recognition'
url: 'https://fastdatascience.com/drug-named-entity-recognition-python-library/'
license: MIT
version: 2.0.9
version: 2.0.11
date-released: '2024-10-04'
url: 'https://zenodo.org/doi/10.5281/zenodo.10970631'
doi: 10.5281/zenodo.10970631
doi: 10.5281/zenodo.10970631
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ include cross_check_against_common_english_vocab.py
include find_short_drug_names.py
include smiles_example_python.py
include structure_mol_example_python.py
include *.sh
recursive-include harvesting_data_from_source *.py
recursive-include harvesting_data_from_source *.csv
recursive-include src *.bz2
Expand Down
16 changes: 14 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,18 @@ To the extent possible under law, the person who associated CC0 with the DrugBan

If you'd like to contribute to this project, you can contact us at https://fastdatascience.com/ or make a pull request on our [GitHub repository](https://github.com/fastdatascience/drug_named_entity_recognition). You can also [raise an issue](https://github.com/fastdatascience/drug_named_entity_recognition/issues).

## Future Improvements

### Data Storage Format Migration

Currently, the drug dictionary data and FuzzySet data structures are stored using Python's `pickle` format. Future work includes:

- **Migrate drug dictionary storage from pickle to JSON**: The drug dictionary data (`drug_variant_to_canonical`, `drug_canonical_to_data`, `drug_variant_to_variant_data`) should be stored in a standard JSON format instead of pickle for better portability, version control compatibility, and security.

- **Add JSON serialization support for FuzzySet**: The FuzzySet data structures (used for fuzzy matching) should be serializable to JSON format. This would allow pre-building FuzzySets during data preparation (`harvesting_data_from_source/07_combine_data_sources.py`) and loading them directly in `drugs_finder.py`, eliminating the need to rebuild them on every import and improving startup performance.

These improvements would make the data format more transparent, easier to inspect, and compatible with a wider range of tools and workflows.

## Developing the Drug Named Entity Recognition library

### Automated tests
Expand Down Expand Up @@ -405,12 +417,12 @@ MIT License. Copyright (c) 2023 [Fast Data Science](https://fastdatascience.com)

## ✍️ Citing the Drug Named Entity Recognition library

Wood, T.A., Drug Named Entity Recognition [Computer software], Version 2.0.9, accessed at [https://fastdatascience.com/drug-named-entity-recognition-python-library](https://fastdatascience.com/drug-named-entity-recognition-python-library), Fast Data Science Ltd (2024)
Wood, T.A., Drug Named Entity Recognition [Computer software], Version 2.0.11, accessed at [https://fastdatascience.com/drug-named-entity-recognition-python-library](https://fastdatascience.com/drug-named-entity-recognition-python-library), Fast Data Science Ltd (2024)

```
@unpublished{drugnamedentityrecognition,
AUTHOR = {Wood, T.A.},
TITLE = {Drug Named Entity Recognition (Computer software), Version 2.0.9},
TITLE = {Drug Named Entity Recognition (Computer software), Version 2.0.11},
YEAR = {2024},
Note = {To appear},
url = {https://zenodo.org/doi/10.5281/zenodo.10970631},
Expand Down
26 changes: 10 additions & 16 deletions harvesting_data_from_source/01_drugbank_download_vocabulary.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,6 @@

import os
import re
import subprocess
from sys import platform

import requests

Expand All @@ -40,23 +38,19 @@

url = re_url.findall(response.text)[0]

print(f"Platform is {platform}.")
if "win" in platform: # if we are on Windows, use curl.exe (supported in Windows 10 and up)
tmpfile = "C:/temp/tmp.zip"
wget = subprocess.Popen(["curl.exe", "--output", tmpfile, "--url", url])
else:
tmpfile = "/tmp/tmp.zip"
wget = subprocess.Popen(["wget", "-O", tmpfile, url])
tmpfile = "/tmp/tmp.zip"
print(f"Downloading Drugbank dump from {url} to {tmpfile}...")
response = requests.get(url)
response.raise_for_status() # Raise an exception for bad status codes

os.waitpid(wget.pid, 0)
with open(tmpfile, 'wb') as f:
f.write(response.content)

print(f"Downloaded Drugbank dump from {url} to {tmpfile}.")

if "win" in platform: # if we are on Windows, use curl.exe (supported in Windows 10 and up)
unzip = subprocess.Popen(["unzip", -"o", tmpfile, "-d", "."])
else:
unzip = subprocess.Popen(["unzip", "-o", tmpfile, "-d", "."])

os.waitpid(unzip.pid, 0)
import zipfile
print(f"Unzipping Drugbank dump from {tmpfile} to current directory...")
with zipfile.ZipFile(tmpfile, 'r') as zip_ref:
zip_ref.extractall(".")

print(f"Unzipped Drugbank dump from {tmpfile} to current directory.")
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,8 @@
import csv
import datetime
import os
import subprocess
import requests
import xml.sax
from sys import platform

# Example URL of MeSH dump: https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/xmlmesh/desc2023.xml

Expand All @@ -47,13 +46,12 @@
print(
f"Downloading MeSH XML dump from {url}. If this URL doesn't work, please navigate to https://www.nlm.nih.gov/ and search the site for a MeSH data dump in XML format.")

print(f"Platform is {platform}.")
if "win" in platform: # if we are on Windows, use curl.exe (supported in Windows 10 and up)
wget = subprocess.Popen(["curl.exe", "--output", mesh_xml_file_name, "--url", url])
else:
wget = subprocess.Popen(["wget", url])
print("Downloading MeSH XML dump...")
response = requests.get(url)
response.raise_for_status() # Raise an exception for bad status codes

os.waitpid(wget.pid, 0)
with open(mesh_xml_file_name, 'wb') as f:
f.write(response.content)

print(f"Downloaded MeSH XML dump from {url}.")

Expand Down
61 changes: 31 additions & 30 deletions harvesting_data_from_source/05_download_smiles_from_pubchem.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,61 +31,62 @@
# Note that we have to get three files: Pubchem ID to SMILES, Pubchem ID to MeSH name (not MeSH ID), and Pubchem ID to mass
# We can later join these to get MeSH name to SMILES

import gzip
import os
import subprocess
from sys import platform
import requests
import shutil

url_pubchem_mesh = "https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Extras/CID-MeSH"
output_file_pubchem_mesh = "CID-MeSH"

print(f"Platform is {platform}.")
if "win" in platform: # if we are on Windows, use curl.exe (supported in Windows 10 and up)
wget = subprocess.Popen(["curl.exe", "--output", output_file_pubchem_mesh, "--url", url_pubchem_mesh])
else:
wget = subprocess.Popen(["wget", "-O", output_file_pubchem_mesh, url_pubchem_mesh])
# Download MeSH file
print(f"Downloading Pubchem MeSH dump for SMILES from {url_pubchem_mesh} to {output_file_pubchem_mesh}...")
response = requests.get(url_pubchem_mesh)
response.raise_for_status()

os.waitpid(wget.pid, 0)
with open(output_file_pubchem_mesh, 'wb') as f:
f.write(response.content)

print(f"Downloaded Pubchem MeSH dump for SMILES from {url_pubchem_mesh} to {output_file_pubchem_mesh}.")

# Download SMILES file
url_pubchem_smiles = "https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Extras/CID-SMILES.gz"
output_file_pubchem_smiles = "CID-SMILES.gz"

print(f"Platform is {platform}.")
if "win" in platform: # if we are on Windows, use curl.exe (supported in Windows 10 and up)
wget = subprocess.Popen(["curl.exe", "--output", output_file_pubchem_smiles, "--url", url_pubchem_smiles])
else:
wget = subprocess.Popen(["wget", "-O", output_file_pubchem_smiles, url_pubchem_smiles])
print(f"Downloading Pubchem SMILES data from {url_pubchem_smiles} to {output_file_pubchem_smiles}...")
response = requests.get(url_pubchem_smiles)
response.raise_for_status()

os.waitpid(wget.pid, 0)
with open(output_file_pubchem_smiles, 'wb') as f:
f.write(response.content)

print(f"Downloaded Pubchem SMILES data from {url_pubchem_smiles} to {output_file_pubchem_smiles}.")

# Download mass file
url_pubchem_mass = "https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Extras/CID-Mass.gz"
output_file_pubchem_mass = "CID-Mass.gz"

print(f"Platform is {platform}.")
if "win" in platform: # if we are on Windows, use curl.exe (supported in Windows 10 and up)
wget = subprocess.Popen(["curl.exe", "--output", output_file_pubchem_mass, "--url", url_pubchem_mass])
else:
wget = subprocess.Popen(["wget", "-O", output_file_pubchem_mass, url_pubchem_mass])
print(f"Downloading Pubchem mass data from {url_pubchem_mass} to {output_file_pubchem_mass}...")
response = requests.get(url_pubchem_mass)
response.raise_for_status()

os.waitpid(wget.pid, 0)
with open(output_file_pubchem_mass, 'wb') as f:
f.write(response.content)

print(f"Downloaded Pubchem mass data from {url_pubchem_mass} to {output_file_pubchem_mass}.")

print(f"Unzipping {output_file_pubchem_smiles}.")

unzip = subprocess.Popen(["gunzip", "-f", output_file_pubchem_smiles])

os.waitpid(unzip.pid, 0)
# Unzip SMILES file
print(f"Unzipping {output_file_pubchem_smiles}...")
with gzip.open(output_file_pubchem_smiles, 'rb') as f_in:
with open(output_file_pubchem_smiles[:-3], 'wb') as f_out: # Remove .gz extension
shutil.copyfileobj(f_in, f_out)

print(f"Unzipped {output_file_pubchem_smiles}.")

print(f"Unzipping {output_file_pubchem_mass}.")

unzip = subprocess.Popen(["gunzip", "-f", output_file_pubchem_mass])

os.waitpid(unzip.pid, 0)
# Unzip mass file
print(f"Unzipping {output_file_pubchem_mass}...")
with gzip.open(output_file_pubchem_mass, 'rb') as f_in:
with open(output_file_pubchem_mass[:-3], 'wb') as f_out: # Remove .gz extension
shutil.copyfileobj(f_in, f_out)

print(f"Unzipped {output_file_pubchem_mass}.")
10 changes: 7 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "drug-named-entity-recognition"
version = "2.0.9"
version = "2.0.11"
description = "Drug Named Entity Recognition library to find and resolve drug names in a string (drug named entity linking)"
readme = "README.md"
keywords = ['drug', 'bio', 'biomedical', 'medical', 'pharma', 'pharmaceutical', 'ner', 'nlp', 'named entity recognition', 'natural language processing', 'named entity linking']
Expand All @@ -11,7 +11,7 @@ maintainers = [
authors = [
{ name = "Thomas Wood", email = "[email protected]" },
]
requires-python = ">=3.6,<=3.13"
requires-python = ">=3.6"
classifiers=[
# see https://pypi.org/classifiers/
"Development Status :: 5 - Production/Stable",
Expand All @@ -32,7 +32,11 @@ classifiers=[
]
# this set should be kept minimal!
dependencies = [
"requests"
"requests",
"nltk",
"fuzzyset2",
"english_words"

]

[project.optional-dependencies]
Expand Down
3 changes: 0 additions & 3 deletions setup.cfg

This file was deleted.

84 changes: 0 additions & 84 deletions setup.py

This file was deleted.

2 changes: 1 addition & 1 deletion src/drug_named_entity_recognition/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,6 @@

'''

__version__ = "2.0.9"
__version__ = "2.0.11"

from drug_named_entity_recognition.drugs_finder import find_drugs
Binary file modified src/drug_named_entity_recognition/drug_ner_dictionary.pkl.bz2
Binary file not shown.
Loading