-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathproxy_request.py
More file actions
376 lines (317 loc) · 17.7 KB
/
proxy_request.py
File metadata and controls
376 lines (317 loc) · 17.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
import requests
import random
import time
import threading
from bs4 import BeautifulSoup
import re
from collections import defaultdict
try:
    from behavior_config import BEHAVIOR_WEIGHTS, BEHAVIOR_PARAMS, RESOURCE_LOAD_PROBS, INTERACTION_PROBS, IMAGE_HOVER_RANGE
except ImportError:
    # Fall back to built-in defaults when behavior_config.py is not present.
    # Relative weights for picking a simulated browsing pattern (must sum to ~1.0).
    BEHAVIOR_WEIGHTS = {
        "quick_browse": 0.25,
        "deep_read": 0.20,
        "search_oriented": 0.25,
        "social_browse": 0.20,
        "research_mode": 0.10
    }
    # Per-pattern tuning ranges (empty -> get_behavior_param() returns its default).
    BEHAVIOR_PARAMS = {}
    # Probability of fetching each kind of sub-resource from a visited page.
    RESOURCE_LOAD_PROBS = {"css": 0.3, "js": 0.3, "image": 0.4}
    # Probability of simulating form-fill / search interactions.
    INTERACTION_PROBS = {"form_fill": 0.2, "search": 0.15}
    # (min, max) number of images to "hover" (fetch), keyed by behavior pattern.
    IMAGE_HOVER_RANGE = {"deep_read": (2, 5), "default": (1, 3)}
# Pool of browser User-Agent strings; one is picked at random per session so
# requests do not all present the same client fingerprint.
# NOTE: entries are intentionally dated/mobile-heavy; duplicates increase the
# relative weight of those agents.
User_Agent = [
    "Mozilla/4.0 (iPod; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
    "Mozilla/4.0 (iPhone; U; CPU iPhone OS 4_3_2 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8H7 Safari/6533.18.5",
    "MQQBrowser/25 (Linux; U; 2.3.3; zh-cn; HTC Desire S Build/GRI40;480*800)",
    "Mozilla/5.0 (Linux; U; Android 2.3.3; zh-cn; HTC_DesireS_S510e Build/GRI40) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (SymbianOS/9.3; U; Series60/3.2 NokiaE75-1 /110.48.125 Profile/MIDP-2.1 Configuration/CLDC-1.1 ) AppleWebKit/413 (KHTML, like Gecko) Safari/413",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Mobile/8J2",
    "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30",
    "Mozilla/4.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_2) AppleWebKit/534.51.22 (KHTML, like Gecko) Version/5.1.1 Safari/534.51.22",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A5313e Safari/7534.48.3",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A5313e Safari/7534.48.3",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A5313e Safari/7534.48.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.202 Safari/535.1",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; SAMSUNG; OMNIA7)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; XBLWP7; ZuneWP7)",
    "Mozilla/5.0 (Windows NT 5.2) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.122 Safari/534.30",
    "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.2; Trident/4.0; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)",
    # BUGFIX: was "MSIE 60" — an invalid UA token that would flag the request
    # as synthetic; the IE6 token is "MSIE 6.0".
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)",
    "Opera/9.80 (Windows NT 5.1; U; zh-cn) Presto/2.9.168 Version/11.50",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022; .NET4.0E; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729; .NET4.0C)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/533.21.1 (KHTML, like Gecko) Version/5.0.5 Safari/533.21.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 2.0.50727; TheWorld)"
]
# Global, process-wide counters: pattern name -> {'success': n, 'total': n}.
# Guarded by stats_lock because worker threads update them concurrently.
behavior_stats = defaultdict(lambda: {'success': 0, 'total': 0})
stats_lock = threading.Lock()


def update_behavior_stats(behavior_pattern, success):
    """Thread-safely record one attempt (and its outcome) for a behavior pattern."""
    with stats_lock:
        entry = behavior_stats[behavior_pattern]
        entry['total'] += 1
        if success:
            entry['success'] += 1
def print_behavior_stats():
    """Dump the per-pattern success/total counters to stdout."""
    with stats_lock:
        print("\n=== 行为模式统计 ===")
        for name, rec in behavior_stats.items():
            total = rec['total']
            # Guard against division by zero for patterns never attempted.
            rate = rec['success'] / total * 100 if total > 0 else 0
            print(f"{name}: 成功 {rec['success']}/{rec['total']} ({rate:.1f}%)")
        print("==================\n")
def get_proxies():
    """Fetch a fresh batch of proxy addresses from the 51daili API.

    Returns a list of "ip:port" strings; empty list on any failure.
    NOTE(review): the API URL embeds account credentials — consider moving
    them to configuration.
    """
    api_url = "https://bapi.51daili.com/getapi2?linePoolIndex=-1&packid=2&time=5&qty=10&port=1&format=txt&dt=4&ct=1&usertype=17&uid=57115&accessName=zzk0615&accessPassword=2749404C7788D977086C1E927C01AB30&skey=autoaddwhiteip"
    try:
        response = requests.get(api_url, timeout=10)
        if response.status_code != 200:
            print(f"获取代理IP失败,状态码: {response.status_code}")
        else:
            # One proxy per line; drop blanks and surrounding whitespace.
            return [entry.strip() for entry in response.text.split('\n') if entry.strip()]
    except Exception as e:
        print(f"获取代理IP异常: {e}")
    return []
def get_realistic_headers():
    """Build a browser-like HTTP header set with a randomly chosen User-Agent.

    Header order is meaningful to some anti-bot checks, so the dict is built
    in the order a real navigation request would send them.
    """
    return {
        "User-Agent": random.choice(User_Agent),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Cache-Control": "max-age=0",
    }
def get_user_behavior_pattern():
    """Pick one behavior-pattern name at random, weighted by BEHAVIOR_WEIGHTS."""
    names, probabilities = zip(*BEHAVIOR_WEIGHTS.items())
    return random.choices(names, weights=probabilities)[0]
def get_behavior_param(behavior_pattern, param_name, default_range=(1, 3)):
    """Look up a tuning parameter for a behavior pattern.

    Falls back to *default_range* when the pattern or the parameter is not
    configured in BEHAVIOR_PARAMS.
    """
    pattern_params = BEHAVIOR_PARAMS.get(behavior_pattern, {})
    if param_name in pattern_params:
        return pattern_params[param_name]
    return default_range
def simulate_user_behavior(session, url, proxies, headers, behavior_pattern=None):
    """Simulate a realistic browsing session on *url*.

    Visits the page, then — tuned by *behavior_pattern* — simulates load/scroll
    dwell times, image "hovers" (fetches), link clicks with possible follow-up
    clicks, form/search interactions, and CSS/JS sub-resource loads.

    Args:
        session: requests.Session reused for all sub-requests (keeps cookies).
        url: page to visit.
        proxies: requests proxy mapping used for every request.
        headers: header dict; NOTE this function mutates it in place
            (Referer / Sec-Fetch-Site) so later calls inherit the state.
        behavior_pattern: pattern key; randomly chosen when None.

    Returns:
        bool: True if the main page loaded and the simulation completed.
        Also records the outcome via update_behavior_stats().
    """
    if behavior_pattern is None:
        behavior_pattern = get_user_behavior_pattern()
    success = False
    try:
        # Load the main page.
        resp = session.get(url, proxies=proxies, headers=headers, timeout=15)
        if resp.status_code != 200:
            update_behavior_stats(behavior_pattern, False)
            return False
        print(f"成功访问页面: {url} (行为模式: {behavior_pattern})")
        soup = BeautifulSoup(resp.content, 'html.parser')
        # Dwell for a pattern-specific "page load / first read" time.
        load_time_range = get_behavior_param(behavior_pattern, "page_load_time")
        time.sleep(random.uniform(*load_time_range))
        # Simulate scrolling: N pauses separated by pattern-specific intervals.
        scroll_range = get_behavior_param(behavior_pattern, "scroll_actions")
        scroll_actions = random.randint(*scroll_range)
        scroll_interval_range = get_behavior_param(behavior_pattern, "scroll_interval")
        for i in range(scroll_actions):
            time.sleep(random.uniform(*scroll_interval_range))
            print(f"模拟滚动操作 {i+1}/{scroll_actions}")
        # Simulate mouse hover by actually fetching a few image resources.
        images = soup.find_all('img', src=True)
        if images:
            hover_range = IMAGE_HOVER_RANGE.get(behavior_pattern, IMAGE_HOVER_RANGE["default"])
            hover_count = random.randint(*hover_range)
            # BUGFIX: cap the sample size — random.sample() raises ValueError
            # when hover_count exceeds len(images) (the link-click path below
            # already guards with min()).
            selected_images = random.sample(images, min(hover_count, len(images)))
            for img in selected_images:
                img_src = img.get('src')
                if img_src and img_src.startswith('http'):
                    try:
                        img_resp = session.get(img_src, proxies=proxies, headers=headers, timeout=5)
                        if img_resp.status_code == 200:
                            print(f"模拟加载图片: {img_src}")
                            time.sleep(random.uniform(0.1, 0.5))
                    except Exception:
                        # Best-effort: a failed image fetch must not abort the session.
                        pass
        # Collect absolute links, skipping non-navigational schemes/anchors.
        links = soup.find_all('a', href=True)
        valid_links = []
        for link in links:
            href = link.get('href')
            if href and href.startswith('http'):
                if not any(skip in href.lower() for skip in ['javascript:', 'mailto:', 'tel:', '#']):
                    valid_links.append(href)
        # Click a pattern-specific number of links.
        click_range = get_behavior_param(behavior_pattern, "click_count")
        click_count = random.randint(*click_range)
        if valid_links and click_count > 0:
            selected_links = random.sample(valid_links, min(click_count, len(valid_links)))
            for i, link in enumerate(selected_links):
                try:
                    print(f"模拟点击链接 {i+1}/{len(selected_links)}: {link}")
                    # A real click carries the current page as Referer; crude
                    # same-origin heuristic drives Sec-Fetch-Site.
                    headers['Referer'] = url
                    headers['Sec-Fetch-Site'] = 'same-origin' if url in link else 'cross-site'
                    click_resp = session.get(link, proxies=proxies, headers=headers, timeout=10)
                    if click_resp.status_code == 200:
                        # Dwell on the clicked page.
                        stay_range = get_behavior_param(behavior_pattern, "stay_time")
                        stay_time = random.uniform(*stay_range)
                        print(f"在新页面停留 {stay_time:.1f} 秒")
                        time.sleep(stay_time)
                        new_soup = BeautifulSoup(click_resp.content, 'html.parser')
                        new_links = new_soup.find_all('a', href=True)
                        # With some probability, click once more from the new page.
                        click_again_prob = get_behavior_param(behavior_pattern, "click_again_prob", (0.2, 0.2))[0]
                        if random.random() < click_again_prob and new_links:
                            sub_link = random.choice(new_links).get('href')
                            if sub_link and sub_link.startswith('http'):
                                print(f"在新页面中再次点击: {sub_link}")
                                headers['Referer'] = link
                                sub_resp = session.get(sub_link, proxies=proxies, headers=headers, timeout=8)
                                if sub_resp.status_code == 200:
                                    time.sleep(random.uniform(2, 5))
                except Exception as e:
                    print(f"点击链接失败: {link}, 错误: {e}")
                    continue
        # Occasionally pretend to fill a form (dwell only, no submit).
        forms = soup.find_all('form')
        if forms and random.random() < INTERACTION_PROBS["form_fill"]:
            print("模拟表单填写行为")
            time.sleep(random.uniform(1, 3))
        # Occasionally pretend to use an on-page search box.
        search_inputs = soup.find_all('input', {'type': 'search'}) or soup.find_all('input', {'name': re.compile(r'search|q', re.I)})
        if search_inputs and random.random() < INTERACTION_PROBS["search"]:
            print("模拟搜索行为")
            time.sleep(random.uniform(1, 2))
        # Occasionally fetch one CSS and/or one JS asset like a browser would.
        css_links = soup.find_all('link', {'rel': 'stylesheet'})
        js_scripts = soup.find_all('script', {'src': True})
        if css_links and random.random() < RESOURCE_LOAD_PROBS["css"]:
            css_link = random.choice(css_links).get('href')
            if css_link and css_link.startswith('http'):
                try:
                    session.get(css_link, proxies=proxies, headers=headers, timeout=5)
                    print(f"模拟加载CSS: {css_link}")
                except Exception:
                    # BUGFIX: was a bare except (also caught KeyboardInterrupt).
                    pass
        if js_scripts and random.random() < RESOURCE_LOAD_PROBS["js"]:
            js_link = random.choice(js_scripts).get('src')
            if js_link and js_link.startswith('http'):
                try:
                    session.get(js_link, proxies=proxies, headers=headers, timeout=5)
                    print(f"模拟加载JS: {js_link}")
                except Exception:
                    pass
        # Final pattern-specific dwell before leaving the page.
        final_stay_range = get_behavior_param(behavior_pattern, "final_stay")
        final_stay = random.uniform(*final_stay_range)
        print(f"最终停留 {final_stay:.1f} 秒")
        time.sleep(final_stay)
        success = True
    except Exception as e:
        print(f"模拟用户行为失败: {e}")
        success = False
    # Record the outcome regardless of how we got here.
    update_behavior_stats(behavior_pattern, success)
    return success
def request_with_proxy(proxy_ip):
    """Run one full simulated visit through *proxy_ip*.

    Loads the main CSDN article, simulates user behavior on it, then deep-visits
    1-2 randomly chosen target URLs with (possibly different) behavior patterns.

    Args:
        proxy_ip: "ip:port" string; used for both http and https traffic.

    Returns:
        bool: True if the main page request succeeded (deep-visit failures do
        not change the result), False on main-page failure or any exception.
    """
    test_url = "https://blog.csdn.net/Why_does_it_work/article/details/148903077?spm=1001.2014.3001.5501"
    click_urls = [
        "https://www.bright.cn/products/web-scraper/?utm_source=brand&utm_campaign=brnd-mkt_cn_csdn_kai202506",
        "https://blog.csdn.net/weixin_50804299/article/details/149075189?spm=1001.2014.3001.5501",
        "https://get.brightdata.com/s-mcpserver",
        "https://www.captainbed.cn/z",
        "https://www.bilibili.com/video/BV1ZoKEzmEkJ/?spm_id_from=333.1387.homepage.video_card.click"
    ]
    proxies = {
        "http": f"http://{proxy_ip}",
        "https": f"http://{proxy_ip}"
    }
    # Browser-like headers; pretend we arrived from the CSDN front page.
    headers = get_realistic_headers()
    headers["Referer"] = "https://blog.csdn.net/"
    session = requests.Session()
    # Pre-seed a cookie so the visit looks like a returning user.
    session.cookies.set('visited', 'true', domain='.csdn.net')
    try:
        # Load the main page first.
        resp = session.get(test_url, proxies=proxies, headers=headers, timeout=10)
        if resp.status_code == 200:
            print(f"代理IP: {proxy_ip},主页面请求成功")
            behavior_pattern = get_user_behavior_pattern()
            print(f"代理IP: {proxy_ip},使用行为模式: {behavior_pattern}")
            # Simulate activity on the main page.
            simulate_user_behavior(session, test_url, proxies, headers, behavior_pattern)
            # Deep-visit 1-2 of the target links.
            target_count = random.randint(1, min(2, len(click_urls)))
            selected_targets = random.sample(click_urls, target_count)
            for target_url in selected_targets:
                print(f"代理IP: {proxy_ip},开始深入访问: {target_url}")
                headers['Referer'] = test_url
                # Each target may use a different behavior pattern.
                target_behavior = get_user_behavior_pattern()
                success = simulate_user_behavior(session, target_url, proxies, headers, target_behavior)
                if success:
                    print(f"代理IP: {proxy_ip},深入访问 {target_url} 成功")
                else:
                    print(f"代理IP: {proxy_ip},深入访问 {target_url} 失败")
                # Pause between targets.
                time.sleep(random.uniform(1, 3))
            return True
        else:
            print(f"代理IP: {proxy_ip},主页面请求失败")
    except Exception as e:
        print(f"代理IP: {proxy_ip} 请求失败: {e}")
    finally:
        # BUGFIX: the session was never closed, leaking pooled connections —
        # significant because main() creates one session per proxy per round.
        session.close()
    return False
def main():
    """Endlessly fetch proxy batches and drive one thread per live proxy.

    Each round: pull a fresh batch, then repeatedly fan out request_with_proxy()
    across the proxies that are still working, dropping the ones that fail,
    until the whole batch is dead. Stats are printed every 5 rounds.
    """
    round_no = 0
    while True:
        round_no += 1
        print(f"\n=== 开始第 {round_no} 轮循环 ===")
        proxy_batch = get_proxies()
        if not proxy_batch:
            print("未获取到代理IP,5秒后重试...")
            time.sleep(5)
            continue
        print(f"本轮获取到{len(proxy_batch)}个代理IP")
        alive = list(proxy_batch)
        while alive:
            outcome = {}

            def worker(ip):
                # Record success/failure per proxy; distinct keys, so the
                # shared dict needs no lock.
                outcome[ip] = request_with_proxy(ip)

            workers = [threading.Thread(target=worker, args=(ip,)) for ip in alive]
            for t in workers:
                t.start()
            for t in workers:
                t.join()
            # Keep only proxies whose worker reported success (a worker that
            # crashed leaves no entry and is dropped too).
            alive = [ip for ip in alive if outcome.get(ip)]
            if not alive:
                print("本轮代理全部失效,进入新一轮循环...")
                break
            time.sleep(30)
        # Periodic stats dump.
        if round_no % 5 == 0:
            print_behavior_stats()


if __name__ == "__main__":
    main()