-
Notifications
You must be signed in to change notification settings - Fork 258
Better regex for finding proxies + deduplication of proxies based on exit ip #765
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
192c118
b33c0fa
0c51e22
1f36e8b
234882f
bcfac18
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,14 +7,14 @@ | |
| }; | ||
|
|
||
| use color_eyre::eyre::WrapErr as _; | ||
| use itertools::Itertools as _; | ||
|
|
||
| use crate::{ | ||
| HashMap, | ||
| config::Config, | ||
| ipdb, | ||
| proxy::{Proxy, ProxyType}, | ||
| utils::is_docker, | ||
| HashMap, | ||
| }; | ||
|
|
||
| fn compare_timeout(a: &Proxy, b: &Proxy) -> Ordering { | ||
|
|
@@ -70,9 +70,26 @@ | |
| if config.output.sort_by_speed { | ||
| proxies.sort_unstable_by(compare_timeout); | ||
| } else { | ||
| proxies.sort_unstable_by(compare_natural); | ||
| } | ||
|
|
||
| // Deduplicate proxies by exit_ip when available. Different proxies can exit via the same IP. | ||
| // We do this after sorting so that if sorted by speed, the fastest one is kept. | ||
| // Track seen exit_ip per protocol to avoid cross-protocol removal | ||
| let mut seen: std::collections::HashSet<(ProxyType, String)> = | ||
| std::collections::HashSet::new(); | ||
| let mut deduped = Vec::with_capacity(proxies.len()); | ||
| for p in proxies { | ||
| if let Some(ip) = &p.exit_ip { | ||
| let key = (p.protocol, ip.clone()); | ||
| if !seen.insert(key) { | ||
| continue; | ||
| } | ||
| } | ||
| deduped.push(p); | ||
| } | ||
|
Comment on lines +82 to +90
|
||
| proxies = deduped; | ||
|
|
||
| if config.output.json.enabled { | ||
| let (maybe_asn_db, maybe_geo_db) = tokio::try_join!( | ||
| async { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,13 +1,12 @@ | ||
| use std::sync::LazyLock; | ||
|
|
||
| pub static PROXY_REGEX: LazyLock<fancy_regex::Regex> = LazyLock::new(|| { | ||
| let pattern = r"(?:^|[^0-9A-Za-z])(?:(?P<protocol>https?|socks[45]):\/\/)?(?:(?P<username>[0-9A-Za-z]{1,64}):(?P<password>[0-9A-Za-z]{1,64})@)?(?P<host>[A-Za-z][\-\.A-Za-z]{0,251}[A-Za-z]|[A-Za-z]|(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3}):(?P<port>[0-9]|[1-9][0-9]{1,3}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])(?=[^0-9A-Za-z]|$)"; | ||
| let pattern = r"(?:^|[^0-9A-Za-z])(?:(?P<protocol>https?|socks[45]):\/\/)?(?:(?P<username>[0-9A-Za-z._~\-]{1,256}):(?P<password>[0-9A-Za-z._~\-]{1,256})@)?(?P<host>[A-Za-z][\-\.A-Za-z]{0,251}[A-Za-z]|[A-Za-z]|(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3}):(?P<port>[0-9]|[1-9][0-9]{1,3}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])(?=[^0-9A-Za-z]|$)"; | ||
|
||
| fancy_regex::RegexBuilder::new(pattern) | ||
| .backtrack_limit(usize::MAX) | ||
| .build() | ||
| .unwrap() | ||
| }); | ||
|
|
||
| static IPV4_REGEX: LazyLock<fancy_regex::Regex> = LazyLock::new(|| { | ||
| let pattern = r"^\s*(?P<host>(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3})(?::(?:[0-9]|[1-9][0-9]{1,3}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5]))?\s*$"; | ||
| fancy_regex::Regex::new(pattern).unwrap() | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[nitpick] The deduplication logic uses `std::collections::HashSet` directly instead of the project's `HashMap` alias. For consistency with the existing codebase that imports `HashMap`, consider using `std::collections::HashSet` consistently or adding a `HashSet` alias to match the pattern.