11"""Utilities to help linting various targets.""" 
22
33import  os 
4+ import  re 
45from  typing  import  (
56    Any ,
67    Dict ,
2122if  TYPE_CHECKING :
2223    from  planemo .cli  import  PlanemoCliContext 
2324
25+ REQUEST_TIMEOUT  =  5 
26+ 
2427
2528def  build_lint_args (ctx : "PlanemoCliContext" , ** kwds ) ->  Dict [str , Any ]:
2629    """Handle common report, error, and skip linting arguments.""" 
@@ -125,6 +128,49 @@ def lint_xsd(lint_ctx, schema_path, path):
125128        lint_ctx .info ("File validates against XML schema." )
126129
127130
def _validate_doi_url(url, lint_ctx):
    """Validate a DOI URL by looking the DOI up through the CrossRef API.

    :param url: candidate URL, expected to look like ``http(s)://doi.org/<doi>``.
    :param lint_ctx: lint context, handed on to ``_validate_http_url`` which
        reports failures through it.
    :returns: ``False`` when ``url`` does not match the DOI URL pattern
        (nothing is reported in that case); otherwise the result of
        validating the corresponding CrossRef ``works`` URL.
    """
    # NOTE: the "." in "doi.org" is unescaped, so it matches any character.
    # The pattern is kept identical to the one validate_url uses to route
    # URLs here, so both always agree on what counts as a DOI URL.
    match = re.match(r"https?://doi.org/(.*)$", url)
    if match is None:
        return False

    doi = match.group(1)
    # Query CrossRef rather than the doi.org URL itself.
    xref_url = f"https://api.crossref.org/works/{doi}"
    return _validate_http_url(xref_url, lint_ctx=lint_ctx)
140+ 
141+ 
def _validate_http_url(url, lint_ctx, user_agent=None):
    """Validate an HTTP/HTTPS URL by issuing a streamed GET request.

    :param url: URL to fetch.
    :param lint_ctx: lint context; ``lint_ctx.error`` is called when the URL
        is considered invalid.
    :param user_agent: optional ``User-Agent`` header value. When given, it
        is sent together with ``Accept: */*``; otherwise no custom headers
        are sent.
    :returns: ``True`` when the request succeeds, when the server answers
        429, or when a 403/503 response body mentions "cloudflare";
        ``False`` (after reporting an error) for any other failure.
    """
    headers = {"User-Agent": user_agent, "Accept": "*/*"} if user_agent else None
    r = None
    try:
        r = requests.get(url, headers=headers, stream=True, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
        # Pull at most one chunk to confirm the body is readable. The default
        # keeps an empty body from raising StopIteration, which the broad
        # handler below would otherwise report as a failure.
        next(r.iter_content(1000), None)
        return True
    except Exception as e:
        if r is not None and r.status_code == 429:
            # too many requests
            return True
        if r is not None and r.status_code in (403, 503) and "cloudflare" in r.text:
            # CloudFlare protection block
            return True
        lint_ctx.error(f"Error '{e}' accessing {url}")
        return False
    finally:
        # With stream=True the connection is only released once the body is
        # fully consumed or the response is closed, so close it explicitly.
        if r is not None:
            r.close()
161+ 
162+ 
def _validate_other_url(url, lint_ctx):
    """Validate a non-HTTP URL by opening it and reading a few bytes.

    :param url: URL to open with ``urlopen``.
    :param lint_ctx: lint context; ``lint_ctx.error`` is called when opening
        or reading fails.
    :returns: ``True`` when the first 100 bytes could be read, ``False``
        (after reporting an error) otherwise.
    """
    try:
        # Bound the wait the same way the HTTP check does so an unresponsive
        # server cannot stall linting indefinitely.
        with urlopen(url, timeout=REQUEST_TIMEOUT) as handle:
            handle.read(100)
        return True
    except Exception as e:
        lint_ctx.error(f"Error '{e}' accessing {url}")
        return False
172+ 
173+ 
128174def  lint_urls (root , lint_ctx ):
129175    """Find referenced URLs and verify they are valid.""" 
130176    urls , docs  =  find_urls_for_xml (root )
@@ -133,34 +179,14 @@ def lint_urls(root, lint_ctx):
133179    BROWSER_USER_AGENT  =  "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36" 
134180
135181    def  validate_url (url , lint_ctx , user_agent = None ):
136-         is_valid  =  True 
137-         if  url .startswith ("http://" ) or  url .startswith ("https://" ):
138-             if  user_agent :
139-                 headers  =  {"User-Agent" : user_agent , "Accept" : "*/*" }
140-             else :
141-                 headers  =  None 
142-             r  =  None 
143-             try :
144-                 r  =  requests .get (url , headers = headers , stream = True )
145-                 r .raise_for_status ()
146-                 next (r .iter_content (1000 ))
147-             except  Exception  as  e :
148-                 if  r  is  not None  and  r .status_code  ==  429 :
149-                     # too many requests 
150-                     pass 
151-                 elif  r  is  not None  and  r .status_code  in  [403 , 503 ] and  "cloudflare"  in  r .text :
152-                     # CloudFlare protection block 
153-                     pass 
154-                 else :
155-                     is_valid  =  False 
156-                     lint_ctx .error (f"Error '{ e } { url }  )
182+         is_valid  =  False 
183+         if  re .match ("https?://doi.org/(.*)$" , url ):
184+             is_valid  =  _validate_doi_url (url , lint_ctx )
185+         elif  url .startswith ("http://" ) or  url .startswith ("https://" ):
186+             is_valid  =  _validate_http_url (url , lint_ctx , user_agent )
157187        else :
158-             try :
159-                 with  urlopen (url ) as  handle :
160-                     handle .read (100 )
161-             except  Exception  as  e :
162-                 is_valid  =  False 
163-                 lint_ctx .error (f"Error '{ e } { url }  )
188+             is_valid  =  _validate_other_url (url , lint_ctx )
189+ 
164190        if  is_valid :
165191            lint_ctx .info ("URL OK %s"  %  url )
166192
0 commit comments