-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathurl_utils.py
More file actions
129 lines (102 loc) · 4.14 KB
/
url_utils.py
File metadata and controls
129 lines (102 loc) · 4.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time
def process_text_urls(text, timeout=10, delay=1):
    """
    Replace URLs in text with markdown links, fetching titles and descriptions.

    Args:
        text (str): Input text containing URLs.
        timeout (int): Per-request timeout in seconds.
        delay (float): Delay between consecutive requests in seconds.

    Returns:
        tuple: (processed_text, descriptions_dict)
            - processed_text (str): Text with each URL replaced by a markdown
              link ([title](url), or [url](url) when no title was found).
            - descriptions_dict (dict): Maps URLs to their descriptions; only
              URLs for which a description was found are included.
    """
    # Regex pattern to match URLs
    url_pattern = r'https?://(?:[-\w.])+(?:[:\d]+)?(?:/(?:[\w/_.])*(?:\?(?:[\w&=%.])*)?(?:#(?:[\w.])*)?)?'

    # Fetch each unique URL once, in first-seen order (dict.fromkeys keeps
    # insertion order, unlike a set) so request order is deterministic.
    unique_urls = list(dict.fromkeys(re.findall(url_pattern, text)))

    descriptions = {}
    links = {}
    for i, url in enumerate(unique_urls):
        # Be respectful: pause BETWEEN requests only — not before the first
        # one and not after the last one.
        if i > 0:
            time.sleep(delay)
        title, description = fetch_page_info(url, timeout)
        # Fall back to the bare URL as link text when no title was found.
        links[url] = f"[{title}]({url})" if title else f"[{url}]({url})"
        if description:
            descriptions[url] = description

    # Substitute with re.sub so each match is replaced in place. A plain
    # str.replace would corrupt the output when one URL is a prefix of
    # another, or re-match text inside an already-inserted markdown link.
    processed_text = re.sub(url_pattern, lambda m: links[m.group(0)], text)

    return processed_text, descriptions
def fetch_page_info(url, timeout=10):
    """
    Fetch the <title> and a meta description from a webpage.

    Args:
        url (str): URL to fetch information from.
        timeout (int): Request timeout in seconds.

    Returns:
        tuple: (title, description) — either element is None when not found
        or when the request/parse fails (the error is printed).
    """
    # Browser-like User-Agent: some sites refuse obvious script clients.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Page title, if present.
        title_tag = soup.find('title')
        title = title_tag.get_text().strip() if title_tag else None

        # Description fallback chain: og:description first, then the
        # standard meta description, then twitter:description.
        description = None
        candidates = (
            soup.find('meta', property='og:description'),
            soup.find('meta', attrs={'name': 'description'}),
            soup.find('meta', attrs={'name': 'twitter:description'}),
        )
        for tag in candidates:
            if tag:
                description = tag.get('content', '').strip()
                if description:
                    break

        return title, description
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None, None
    except Exception as e:
        print(f"Error parsing {url}: {e}")
        return None, None
# Example usage
if __name__ == "__main__":
    # Demonstration input containing several URLs.
    sample_text = """
    Check out this article: https://www.python.org/
    Also visit https://github.com/python/cpython for the source code.
    Here's another link: https://docs.python.org/3/
    """

    processed_text, descriptions = process_text_urls(sample_text)

    print("Original text:")
    print(sample_text)
    print("\nProcessed text:")
    print(processed_text)
    print("\nDescriptions found:")
    for link, summary in descriptions.items():
        print(f"- {link}: {summary}")