Skip to content

Commit 0c408fe

Browse files
authored
Merge pull request #1559 from bernt-matthias/doi-url-linter
Add special handling for DOIs to URL linter
2 parents 16f8296 + f3fc4d9 commit 0c408fe

File tree

1 file changed

+53
-27
lines changed

1 file changed

+53
-27
lines changed

planemo/lint.py

Lines changed: 53 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Utilities to help linting various targets."""
22

33
import os
4+
import re
45
from typing import (
56
Any,
67
Dict,
@@ -21,6 +22,8 @@
2122
if TYPE_CHECKING:
2223
from planemo.cli import PlanemoCliContext
2324

25+
REQUEST_TIMEOUT = 5
26+
2427

2528
def build_lint_args(ctx: "PlanemoCliContext", **kwds) -> Dict[str, Any]:
2629
"""Handle common report, error, and skip linting arguments."""
@@ -125,6 +128,49 @@ def lint_xsd(lint_ctx, schema_path, path):
125128
lint_ctx.info("File validates against XML schema.")
126129

127130

131+
def _validate_doi_url(url, lint_ctx):
    """Validate a ``doi.org`` URL by looking its DOI up in the CrossRef API.

    The DOI is taken to be everything that follows ``http://doi.org/`` or
    ``https://doi.org/`` in *url*. It is appended to the CrossRef ``works``
    endpoint and that URL is checked with ``_validate_http_url``.

    Args:
        url: The URL to check.
        lint_ctx: Lint context; ``lint_ctx.error`` is called when *url* is
            not a ``doi.org`` URL.

    Returns:
        ``False`` when *url* is not a ``doi.org`` URL, otherwise whatever
        ``_validate_http_url`` returns for the CrossRef lookup URL.
    """
    # Raw string with an escaped dot: only the literal host "doi.org" is
    # accepted (an unescaped "." would also match e.g. "doiXorg").
    match = re.match(r"https?://doi\.org/(.*)$", url)
    if match is None:
        # Report the problem so a False result is never silent.
        lint_ctx.error(f"Error 'not a doi.org URL' accessing {url}")
        return False

    doi = match.group(1)
    xref_url = f"https://api.crossref.org/works/{doi}"
    return _validate_http_url(xref_url, lint_ctx=lint_ctx)
140+
141+
142+
def _validate_http_url(url, lint_ctx, user_agent=None):
    """Validate an HTTP/HTTPS URL by requesting it.

    The URL is fetched with a streaming GET (bounded by ``REQUEST_TIMEOUT``)
    and at most the first chunk of the body is read.

    Args:
        url: The URL to request.
        lint_ctx: Lint context; ``lint_ctx.error`` is called on failure.
        user_agent: Optional ``User-Agent`` header value. When given, an
            ``Accept: */*`` header is sent as well.

    Returns:
        ``True`` when the request succeeds, when the server answers 429, or
        when a 403/503 response body mentions "cloudflare"; ``False``
        (after reporting an error) otherwise.
    """
    headers = {"User-Agent": user_agent, "Accept": "*/*"} if user_agent else None
    response = None
    try:
        response = requests.get(url, headers=headers, stream=True, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        # Pull one chunk to prove the body is readable. The default keeps a
        # successful response with an empty body from raising StopIteration,
        # which the broad handler below would report as an error.
        next(response.iter_content(1000), None)
        return True
    except Exception as e:
        if response is not None:
            if response.status_code == 429:
                # too many requests
                return True
            if response.status_code in (403, 503) and "cloudflare" in response.text:
                # CloudFlare protection block
                return True
        lint_ctx.error(f"Error '{e}' accessing {url}")
        return False
    finally:
        # With stream=True the connection is held until the body is consumed
        # or the response is closed, so release it explicitly.
        if response is not None:
            response.close()
161+
162+
163+
def _validate_other_url(url, lint_ctx):
    """Validate a non-HTTP URL by opening it and reading a few bytes.

    Args:
        url: The URL to open with ``urlopen``.
        lint_ctx: Lint context; ``lint_ctx.error`` is called on failure.

    Returns:
        ``True`` when the URL could be opened and read, ``False`` (after
        reporting an error) otherwise.
    """
    try:
        # Use the same timeout as the HTTP check so an unresponsive server
        # cannot stall linting indefinitely.
        # NOTE(review): assumes `urlopen` is urllib.request.urlopen, which
        # accepts `timeout` -- confirm against the file's imports.
        with urlopen(url, timeout=REQUEST_TIMEOUT) as handle:
            handle.read(100)
        return True
    except Exception as e:
        lint_ctx.error(f"Error '{e}' accessing {url}")
        return False
172+
173+
128174
def lint_urls(root, lint_ctx):
129175
"""Find referenced URLs and verify they are valid."""
130176
urls, docs = find_urls_for_xml(root)
@@ -133,34 +179,14 @@ def lint_urls(root, lint_ctx):
133179
BROWSER_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36"
134180

135181
def validate_url(url, lint_ctx, user_agent=None):
136-
is_valid = True
137-
if url.startswith("http://") or url.startswith("https://"):
138-
if user_agent:
139-
headers = {"User-Agent": user_agent, "Accept": "*/*"}
140-
else:
141-
headers = None
142-
r = None
143-
try:
144-
r = requests.get(url, headers=headers, stream=True)
145-
r.raise_for_status()
146-
next(r.iter_content(1000))
147-
except Exception as e:
148-
if r is not None and r.status_code == 429:
149-
# too many requests
150-
pass
151-
elif r is not None and r.status_code in [403, 503] and "cloudflare" in r.text:
152-
# CloudFlare protection block
153-
pass
154-
else:
155-
is_valid = False
156-
lint_ctx.error(f"Error '{e}' accessing {url}")
182+
is_valid = False
183+
if re.match("https?://doi.org/(.*)$", url):
184+
is_valid = _validate_doi_url(url, lint_ctx)
185+
elif url.startswith("http://") or url.startswith("https://"):
186+
is_valid = _validate_http_url(url, lint_ctx, user_agent)
157187
else:
158-
try:
159-
with urlopen(url) as handle:
160-
handle.read(100)
161-
except Exception as e:
162-
is_valid = False
163-
lint_ctx.error(f"Error '{e}' accessing {url}")
188+
is_valid = _validate_other_url(url, lint_ctx)
189+
164190
if is_valid:
165191
lint_ctx.info("URL OK %s" % url)
166192

0 commit comments

Comments
 (0)