1
1
"""Utilities to help linting various targets."""
2
2
3
3
import os
4
+ import re
4
5
from typing import (
5
6
Any ,
6
7
Dict ,
21
22
if TYPE_CHECKING :
22
23
from planemo .cli import PlanemoCliContext
23
24
25
+ REQUEST_TIMEOUT = 5
26
+
24
27
25
28
def build_lint_args (ctx : "PlanemoCliContext" , ** kwds ) -> Dict [str , Any ]:
26
29
"""Handle common report, error, and skip linting arguments."""
@@ -125,6 +128,49 @@ def lint_xsd(lint_ctx, schema_path, path):
125
128
lint_ctx .info ("File validates against XML schema." )
126
129
127
130
131
def _validate_doi_url(url, lint_ctx):
    """Validate a ``doi.org`` URL by looking the DOI up via the CrossRef API.

    :param url: URL expected to look like ``http(s)://doi.org/<doi>``.
    :param lint_ctx: lint context; only its ``error`` method is used here.
    :returns: ``True`` if the CrossRef lookup is accepted by
        ``_validate_http_url``, ``False`` otherwise. Every ``False`` result is
        accompanied by an error reported on ``lint_ctx``.
    """
    # Raw string with an escaped dot so only the literal host "doi.org" matches,
    # and ``.+`` so an empty DOI is rejected instead of being turned into a
    # request against the CrossRef "/works/" collection endpoint.
    match = re.match(r"https?://doi\.org/(.+)$", url)
    if match is None:
        lint_ctx.error(f"Invalid DOI URL {url}")
        return False

    doi = match.group(1)
    xref_url = f"https://api.crossref.org/works/{doi}"
    return _validate_http_url(xref_url, lint_ctx=lint_ctx)
140
+
141
+
142
def _validate_http_url(url, lint_ctx, user_agent=None):
    """Validate an HTTP/HTTPS URL by issuing a streamed GET request.

    :param url: URL to fetch.
    :param lint_ctx: lint context; ``error`` is called when the URL is rejected.
    :param user_agent: optional ``User-Agent`` header value; when given, an
        ``Accept: */*`` header is sent along with it.
    :returns: ``True`` if the request succeeds, or if it fails with a status
        that is treated as "cannot tell" (429, or a 403/503 whose body mentions
        cloudflare); ``False`` otherwise.
    """
    headers = {"User-Agent": user_agent, "Accept": "*/*"} if user_agent else None
    r = None
    try:
        r = requests.get(url, headers=headers, stream=True, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
        # Pull a first chunk to confirm the body is readable. The default keeps
        # a successful but empty body from raising StopIteration and being
        # reported as an error.
        next(r.iter_content(1000), None)
        return True
    except Exception as e:
        if r is not None and r.status_code == 429:
            # too many requests
            return True
        elif r is not None and r.status_code in [403, 503] and "cloudflare" in r.text:
            # CloudFlare protection block
            return True
        else:
            lint_ctx.error(f"Error '{e}' accessing {url}")
            return False
    finally:
        # With stream=True the connection stays checked out until the response
        # is fully consumed or closed, so release it explicitly.
        if r is not None:
            r.close()
161
+
162
+
163
def _validate_other_url(url, lint_ctx):
    """Validate a non-HTTP URL by opening it and reading a few bytes.

    :param url: URL to open with ``urlopen``.
    :param lint_ctx: lint context; ``error`` is called when opening or reading
        fails.
    :returns: ``True`` if the first bytes could be read, ``False`` otherwise.
    """
    try:
        # Bound the wait with the same timeout as the HTTP path so an
        # unresponsive server cannot stall linting indefinitely.
        # NOTE(review): assumes ``urlopen`` is ``urllib.request.urlopen``, which
        # accepts ``timeout`` - confirm against the file's imports.
        with urlopen(url, timeout=REQUEST_TIMEOUT) as handle:
            handle.read(100)
        return True
    except Exception as e:
        lint_ctx.error(f"Error '{e}' accessing {url}")
        return False
172
+
173
+
128
174
def lint_urls (root , lint_ctx ):
129
175
"""Find referenced URLs and verify they are valid."""
130
176
urls , docs = find_urls_for_xml (root )
@@ -133,34 +179,14 @@ def lint_urls(root, lint_ctx):
133
179
BROWSER_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36"
134
180
135
181
def validate_url (url , lint_ctx , user_agent = None ):
136
- is_valid = True
137
- if url .startswith ("http://" ) or url .startswith ("https://" ):
138
- if user_agent :
139
- headers = {"User-Agent" : user_agent , "Accept" : "*/*" }
140
- else :
141
- headers = None
142
- r = None
143
- try :
144
- r = requests .get (url , headers = headers , stream = True )
145
- r .raise_for_status ()
146
- next (r .iter_content (1000 ))
147
- except Exception as e :
148
- if r is not None and r .status_code == 429 :
149
- # too many requests
150
- pass
151
- elif r is not None and r .status_code in [403 , 503 ] and "cloudflare" in r .text :
152
- # CloudFlare protection block
153
- pass
154
- else :
155
- is_valid = False
156
- lint_ctx .error (f"Error '{ e } ' accessing { url } " )
182
+ is_valid = False
183
+ if re .match ("https?://doi.org/(.*)$" , url ):
184
+ is_valid = _validate_doi_url (url , lint_ctx )
185
+ elif url .startswith ("http://" ) or url .startswith ("https://" ):
186
+ is_valid = _validate_http_url (url , lint_ctx , user_agent )
157
187
else :
158
- try :
159
- with urlopen (url ) as handle :
160
- handle .read (100 )
161
- except Exception as e :
162
- is_valid = False
163
- lint_ctx .error (f"Error '{ e } ' accessing { url } " )
188
+ is_valid = _validate_other_url (url , lint_ctx )
189
+
164
190
if is_valid :
165
191
lint_ctx .info ("URL OK %s" % url )
166
192
0 commit comments