From bdecf19fa1a938950b5b0e36fe0d6203d30daae8 Mon Sep 17 00:00:00 2001 From: KhulnaSoft bot <43526132+khulnasoft-bot@users.noreply.github.com> Date: Tue, 23 Sep 2025 08:06:59 +0600 Subject: [PATCH 1/7] Add CIDR range scanning support for proxy discovery --- src/parsers.rs | 181 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 181 insertions(+) diff --git a/src/parsers.rs b/src/parsers.rs index 902e76f..e757c65 100644 --- a/src/parsers.rs +++ b/src/parsers.rs @@ -1,5 +1,7 @@ use std::sync::LazyLock; +use ipnetwork::IpNetwork; + pub static PROXY_REGEX: LazyLock = LazyLock::new(|| { let pattern = r"(?:^|[^0-9A-Za-z])(?:(?Phttps?|socks[45]):\/\/)?(?:(?P[0-9A-Za-z]{1,64}):(?P[0-9A-Za-z]{1,64})@)?(?P[A-Za-z][\-\.A-Za-z]{0,251}[A-Za-z]|[A-Za-z]|(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3}):(?P[0-9]|[1-9][0-9]{1,3}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])(?=[^0-9A-Za-z]|$)"; fancy_regex::RegexBuilder::new(pattern) @@ -13,6 +15,11 @@ static IPV4_REGEX: LazyLock = LazyLock::new(|| { fancy_regex::Regex::new(pattern).unwrap() }); +static CIDR_REGEX: LazyLock = LazyLock::new(|| { + let pattern = r"(?:^|[^0-9A-Za-z])(?P(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3})/(?P[0-9]|[12][0-9]|3[0-2]):(?P[0-9]|[1-9][0-9]{1,3}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])(?=[^0-9A-Za-z]|$)"; + fancy_regex::Regex::new(pattern).unwrap() +}); + pub fn parse_ipv4(s: &str) -> Option { if let Ok(Some(captures)) = IPV4_REGEX.captures(s) { captures.name("host").map(|capture| capture.as_str().to_owned()) @@ -20,3 +27,177 @@ pub fn parse_ipv4(s: &str) -> Option { None } } + +/// Expands CIDR ranges in text into individual IP:port entries +/// Supports format like "192.168.1.0/24:8080" which expands to all IPs in the range +/// Handles various separators (spaces, commas, newlines, etc.) between entries +pub fn expand_cidr_ranges(text: &str) -> String { + let mut result = text.to_string(); + let mut offset: i32 = 0; + + // Find all CIDR matches and expand them + let captures: Vec<_> = CIDR_REGEX.captures_iter(text) + .filter_map(|m| m.ok()) + .collect(); + + for capture in captures { + if let (Some(network), Some(prefix), Some(port)) = ( + capture.name("network"), + capture.name("prefix"), + capture.name("port") + ) { + let cidr_str = format!("{}/{}", network.as_str(), prefix.as_str()); + + match cidr_str.parse::() { + Ok(network) => { + // Generate expanded IPs + let expanded_ips: Vec = network.iter() + .filter(|ip| ip.is_ipv4()) + .map(|ip| format!("{}:{}", ip, port.as_str())) + .collect(); + + if !expanded_ips.is_empty() { + // Get the full match including any leading non-alphanumeric character + let full_match = capture.get(0).unwrap(); + let match_start = (full_match.start() as i32 + offset) as usize; + let match_end = (full_match.end() as i32 + offset) as usize; + + // Determine what separator to use by checking what follows + let separator = if match_end < result.len() { + let next_char = result.chars().nth(match_end); + match next_char { + Some('\n') => "\n", + Some('\t') => "\t", + Some(',') => ",", + _ => " ", + } + } else { + "\n" + }; + + // Join expanded IPs with the detected separator + let replacement = expanded_ips.join(separator); + + // Handle case where match starts with a delimiter character + let (_actual_start, prefix_char) = if match_start > 0 { + let prev_char = result.chars().nth(match_start); + if prev_char.map_or(false, |c| !c.is_ascii_alphanumeric()) { + (match_start + 1, result.chars().nth(match_start).unwrap().to_string()) + } else { + (match_start, String::new()) + } + } else { + (match_start, String::new()) + }; + + let final_replacement = format!("{}{}", prefix_char, replacement); + + // Replace the CIDR pattern with expanded IPs + result.replace_range(match_start..match_end, &final_replacement); + + // Update offset for subsequent replacements + let len_diff = final_replacement.len() as i32 - (match_end - match_start) as i32; + offset += len_diff; + } + } + Err(_) => { + // If parsing fails, leave the original text unchanged + continue; + } + } + } + } + + result +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_cidr_expansion() { + // Test basic CIDR expansion + let input = "192.168.1.0/30:8080"; + let result = expand_cidr_ranges(input); + let lines: Vec<&str> = result.trim().split('\n').collect(); + + assert_eq!(lines.len(), 4); + assert!(lines.contains(&"192.168.1.0:8080")); + assert!(lines.contains(&"192.168.1.1:8080")); + assert!(lines.contains(&"192.168.1.2:8080")); + assert!(lines.contains(&"192.168.1.3:8080")); + } + + #[test] + fn test_mixed_input() { + let input = "192.168.1.0/31:8080\n127.0.0.1:9090\ninvalid-line"; + let result = expand_cidr_ranges(input); + let lines: Vec<&str> = result.trim().split('\n').collect(); + + // Should have 2 CIDR-expanded IPs + 1 regular IP + 1 invalid line + assert_eq!(lines.len(), 4); + assert!(lines.contains(&"192.168.1.0:8080")); + assert!(lines.contains(&"192.168.1.1:8080")); + assert!(lines.contains(&"127.0.0.1:9090")); + assert!(lines.contains(&"invalid-line")); + } + + #[test] + fn test_single_ip_cidr() { + let input = "10.0.0.1/32:3128"; + let result = expand_cidr_ranges(input); + assert_eq!(result.trim(), "10.0.0.1:3128"); + } + + #[test] + fn test_non_newline_separated_behavior() { + // Test space-separated entries with CIDR expansion + let input = "192.168.1.0/31:8080 127.0.0.1:9090"; + let result = expand_cidr_ranges(input); + + // Should expand the CIDR range and preserve the regular proxy + assert!(result.contains("192.168.1.0:8080")); + assert!(result.contains("192.168.1.1:8080")); + assert!(result.contains("127.0.0.1:9090")); + } + + #[test] + fn test_multiple_cidr_same_line_behavior() { + // Test multiple CIDR ranges on same line + let input = "192.168.1.0/31:8080 10.0.0.0/31:3128"; + let result = expand_cidr_ranges(input); + + // Should expand both CIDR ranges + assert!(result.contains("192.168.1.0:8080")); + assert!(result.contains("192.168.1.1:8080")); + assert!(result.contains("10.0.0.0:3128")); + assert!(result.contains("10.0.0.1:3128")); + } + + #[test] + fn test_comma_separated_cidr() { + let input = "192.168.1.0/31:8080,10.0.0.0/31:3128"; + let result = expand_cidr_ranges(input); + + // Should expand both CIDR ranges and preserve comma separation + assert!(result.contains("192.168.1.0:8080")); + assert!(result.contains("192.168.1.1:8080")); + assert!(result.contains("10.0.0.0:3128")); + assert!(result.contains("10.0.0.1:3128")); + } + + #[test] + fn test_mixed_separators() { + let input = "192.168.1.0/31:8080\t10.0.0.1:3128,203.0.113.0/31:1080 127.0.0.1:9090"; + let result = expand_cidr_ranges(input); + + // Should expand CIDR ranges and preserve non-CIDR entries + assert!(result.contains("192.168.1.0:8080")); + assert!(result.contains("192.168.1.1:8080")); + assert!(result.contains("10.0.0.1:3128")); + assert!(result.contains("203.0.113.0:1080")); + assert!(result.contains("203.0.113.1:1080")); + assert!(result.contains("127.0.0.1:9090")); + } +} From 64125560d65f7d22ed207318a0188774c31ded3b Mon Sep 17 00:00:00 2001 From: KhulnaSoft bot <43526132+khulnasoft-bot@users.noreply.github.com> Date: Tue, 23 Sep 2025 08:09:42 +0600 Subject: [PATCH 2/7] Add CIDR range scanning support for proxy discovery --- src/scraper.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/scraper.rs b/src/scraper.rs index 11a631d..7a7f42b 100644 --- a/src/scraper.rs +++ b/src/scraper.rs @@ -8,7 +8,7 @@ use crate::event::{AppEvent, Event}; use crate::{ HashSet, config::{Config, Source}, - parsers::PROXY_REGEX, + parsers::{PROXY_REGEX, expand_cidr_ranges}, proxy::{Proxy, ProxyType}, utils::pretty_error, }; @@ -68,12 +68,15 @@ async fn scrape_one( } }; + // Expand CIDR ranges to individual IP:port entries + let expanded_text = expand_cidr_ranges(&text); + #[cfg(feature = "tui")] let mut seen_protocols = HashSet::new(); let mut new_proxies = HashSet::new(); - for maybe_capture in PROXY_REGEX.captures_iter(&text) { + for (i, maybe_capture) in PROXY_REGEX.captures_iter(&expanded_text).enumerate() { if config.scraping.max_proxies_per_source != 0 && new_proxies.len() >= config.scraping.max_proxies_per_source { @@ -121,7 +124,7 @@ async fn scrape_one( } drop(config); - drop(text); + drop(expanded_text); if new_proxies.is_empty() { tracing::warn!("{}: no proxies found", source.url); From 00efe073ac3843a9cf36f1b2df6499a3bcfd8847 Mon Sep 17 00:00:00 2001 From: KhulnaSoft bot <43526132+khulnasoft-bot@users.noreply.github.com> Date: Tue, 23 Sep 2025 08:11:34 +0600 Subject: [PATCH 3/7] Create test_config.toml --- test_config.toml | 144 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 test_config.toml diff --git a/test_config.toml b/test_config.toml new file mode 100644 index 0000000..68e4f0f --- /dev/null +++ b/test_config.toml @@ -0,0 +1,144 @@ +# Enable debug logging (shows detailed checking process) +# Warning: Produces very verbose output +debug = false + + +[scraping] +# Maximum proxies to collect per source (0 = unlimited) +# Helps skip unreliable sources with too many proxies +max_proxies_per_source = 100000 + +# Request timeout for fetching proxy sources (seconds) +# Higher values may find more sources but take longer +timeout = 60.0 +connect_timeout = 5.0 + +# HTTP(S),SOCKS4 or SOCKS5 proxy used for fetching sources (e.g., "socks5://user:pass@host:port"). Leave empty to disable. +proxy = "" + +# User-Agent header for scraping requests +user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36" + + +[checking] +# URL for checking proxy functionality +# httpbin-compatible: Returns JSON with IP info for ASN/geo data +# plain-text: Returns just IP address for basic connectivity +# Examples: +# "https://httpbin.org/ip" - JSON with "origin" key. Full featured checking. +# "https://api.ipify.org" - Simple IP return. Full featured checking. +# "https://google.com" - Basic connect/read check +# "" - Skip checking (scrape only) +check_url = "https://api.ipify.org" + +# Concurrent proxy checks (adjust based on RAM/network capacity) +# Start low and increase gradually if system handles it well +max_concurrent_checks = 1024 + +# Proxy response timeout (seconds) +# Higher = more working proxies found, slower checking +# Lower = faster checking, may miss slower proxies +timeout = 60.0 +connect_timeout = 5.0 + +# User-Agent header for proxy check requests +user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36" + + +[output] +# Output directory (Docker ignores this setting) +path = "./out" + +# Sort by response time (true) or IP address (false) +sort_by_speed = true + + +# Plain text output (.txt files) +[output.txt] +enabled = false + + +# JSON output with metadata (.json files) +[output.json] +enabled = false + +# Add ASN (network provider) info to JSON output +# Uses offline MaxMind database +include_asn = true + +# Add geolocation (country/city) info to JSON output +# Uses offline MaxMind database +include_geolocation = true + + +# Proxy sources configuration +# Add local files: ["./my_proxies.txt"] or URLs +# Sources are fetched in parallel for speed + +enabled = true +urls = ["file:///tmp/test_cidr.txt"] + # Local file examples: + # "./my_http_proxies.txt", + # "/home/user/my_http_proxies.txt", + # "C:/Users/user/Desktop/my_http_proxies.txt", + # "file:///home/user/my_http_proxies.txt", + + # Advanced URL configuration examples (with basic auth or custom headers): + # HTTP Basic Auth example: + # { url = "https://some.api/endpoint", basic_auth = { username = "user", password = "password123" } }, + # Custom headers example: + # { url = "https://some.api/endpoint", headers = { Authorization = "Bearer YOUR_API_KEY" } }, + + "https://api.proxyscrape.com/v3/free-proxy-list/get?request=getproxies&protocol=http", + "https://api.proxyscrape.com/v3/free-proxy-list/get?request=getproxies&protocol=https", + "https://raw.githubusercontent.com/proxifly/free-proxy-list/refs/heads/main/proxies/protocols/http/data.txt", + "https://raw.githubusercontent.com/proxifly/free-proxy-list/refs/heads/main/proxies/protocols/https/data.txt", + "https://raw.githubusercontent.com/roosterkid/openproxylist/refs/heads/main/HTTPS_RAW.txt", + "https://raw.githubusercontent.com/sunny9577/proxy-scraper/refs/heads/master/generated/http_proxies.txt", + "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/refs/heads/master/http.txt", +] + +[scraping.socks4] +enabled = false +urls = [ + # Local file examples: + # "./my_socks4_proxies.txt", + # "/home/user/my_socks4_proxies.txt", + # "C:/Users/user/Desktop/my_socks4_proxies.txt", + # "file:///home/user/my_socks4_proxies.txt", + + # Advanced URL configuration examples (with basic auth or custom headers): + # HTTP Basic Auth example: + # { url = "https://some.api/endpoint", basic_auth = { username = "user", password = "password123" } }, + # Custom headers example: + # { url = "https://some.api/endpoint", headers = { Authorization = "Bearer YOUR_API_KEY" } }, + + "https://api.proxyscrape.com/v3/free-proxy-list/get?request=getproxies&protocol=socks4", + "https://raw.githubusercontent.com/proxifly/free-proxy-list/refs/heads/main/proxies/protocols/socks4/data.txt", + "https://raw.githubusercontent.com/roosterkid/openproxylist/refs/heads/main/SOCKS4_RAW.txt", + "https://raw.githubusercontent.com/sunny9577/proxy-scraper/refs/heads/master/generated/socks4_proxies.txt", + "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/refs/heads/master/socks4.txt", +] + +[scraping.socks5] +enabled = false +urls = [ + # Local file examples: + # "./my_socks5_proxies.txt", + # "/home/user/my_socks5_proxies.txt", + # "C:/Users/user/Desktop/my_socks5_proxies.txt", + # "file:///home/user/my_socks5_proxies.txt", + + # Advanced URL configuration examples (with basic auth or custom headers): + # HTTP Basic Auth example: + # { url = "https://some.api/endpoint", basic_auth = { username = "user", password = "password123" } }, + # Custom headers example: + # { url = "https://some.api/endpoint", headers = { Authorization = "Bearer YOUR_API_KEY" } }, + + "https://api.proxyscrape.com/v3/free-proxy-list/get?request=getproxies&protocol=socks5", + "https://raw.githubusercontent.com/hookzof/socks5_list/refs/heads/master/proxy.txt", + "https://raw.githubusercontent.com/proxifly/free-proxy-list/refs/heads/main/proxies/protocols/socks5/data.txt", + "https://raw.githubusercontent.com/roosterkid/openproxylist/refs/heads/main/SOCKS5_RAW.txt", + "https://raw.githubusercontent.com/sunny9577/proxy-scraper/refs/heads/master/generated/socks5_proxies.txt", + "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/refs/heads/master/socks5.txt", +] From 03de04891e71e4cf6c91a304a926f0a1e7271eb9 Mon Sep 17 00:00:00 2001 From: KhulnaSoft bot <43526132+khulnasoft-bot@users.noreply.github.com> Date: Tue, 23 Sep 2025 08:12:39 +0600 Subject: [PATCH 4/7] Create CIDR_EXAMPLES.md --- CIDR_EXAMPLES.md | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 CIDR_EXAMPLES.md diff --git a/CIDR_EXAMPLES.md b/CIDR_EXAMPLES.md new file mode 100644 index 0000000..68b02b8 --- /dev/null +++ b/CIDR_EXAMPLES.md @@ -0,0 +1,38 @@ +# CIDR Range Scanning Examples + +This file demonstrates how to use CIDR range scanning in proxy-spider. + +## Basic CIDR Examples + +# Single IP (equivalent to 192.168.1.100:8080) +192.168.1.100/32:8080 + +# Small subnet (4 IPs: .0, .1, .2, .3) +192.168.1.0/30:3128 + +# Medium subnet (8 IPs: .0 through .7) +10.0.0.0/29:1080 + +# Larger subnet (16 IPs: .240 through .255) +172.16.1.240/28:8888 + +# Class C subnet (256 IPs: .0 through .255) +203.0.113.0/24:9090 + +## Mixed with Regular Entries + +# You can mix CIDR ranges with regular IP:port entries: +192.168.1.0/30:8080 +127.0.0.1:8888 +10.0.0.0/31:3128 +8.8.8.8:53 + +## Comments and Invalid Lines + +# Lines starting with # are treated as comments and ignored +# Invalid CIDR ranges are preserved as-is for the regular parser +invalid-cidr-range:1234 +not-an-ip:port + +# Different protocols can use the same format +# Just put them in the appropriate protocol section in your config From b99db59106139c8abdec488c84dc76b3cd9c2aac Mon Sep 17 00:00:00 2001 From: KhulnaSoft bot <43526132+khulnasoft-bot@users.noreply.github.com> Date: Tue, 23 Sep 2025 08:13:40 +0600 Subject: [PATCH 5/7] Update config.toml --- config.toml | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/config.toml b/config.toml index 58dc6f5..c3661f0 100644 --- a/config.toml +++ b/config.toml @@ -17,7 +17,7 @@ connect_timeout = 5.0 proxy = "" # User-Agent header for scraping requests -user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36" +user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36" [checking] @@ -42,7 +42,7 @@ timeout = 60.0 connect_timeout = 5.0 # User-Agent header for proxy check requests -user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36" +user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36" [output] @@ -74,15 +74,31 @@ include_geolocation = true # Proxy sources configuration # Add local files: ["./my_proxies.txt"] or URLs # Sources are fetched in parallel for speed +# +# CIDR Range Support: +# You can now use CIDR notation to scan entire network ranges! +# Format: IP/prefix:port (e.g., "192.168.1.0/24:8080") +# This expands to all IPs in the range with the specified port. +# Examples: +# 192.168.1.0/30:8080 → 4 IPs (.0, .1, .2, .3) +# 10.0.0.0/24:3128 → 256 IPs (.0 through .255) +# 203.0.113.0/29:1080 → 8 IPs (.0 through .7) +# Mix CIDR ranges with individual IP:port entries in the same file. [scraping.http] enabled = true urls = [ # Local file examples: # "./my_http_proxies.txt", - # "/home/user/my_http_proxies.txt", + # "/home/user/my_http_proxies.txt", # "C:/Users/user/Desktop/my_http_proxies.txt", # "file:///home/user/my_http_proxies.txt", + + # CIDR range file example: + # File containing: 192.168.1.0/24:8080 + # 10.0.0.0/29:3128 + # 203.0.113.1:1080 + # "./my_cidr_ranges.txt", # Advanced URL configuration examples (with basic auth or custom headers): # HTTP Basic Auth example: From 5e11af1041921296c31ad8da4cfb7739eac3c280 Mon Sep 17 00:00:00 2001 From: KhulnaSoft bot <43526132+khulnasoft-bot@users.noreply.github.com> Date: Tue, 23 Sep 2025 02:35:30 +0000 Subject: [PATCH 6/7] Add CIDR range expansion support for proxy scanning --- CIDR_EXAMPLES.md | 11 ++++++++++- Cargo.lock | 1 + Cargo.toml | 1 + config.toml | 6 +++--- out/.gitkeep | 1 - src/parsers.rs | 42 +++++++++++++++++++++--------------------- src/scraper.rs | 5 +++-- 7 files changed, 39 insertions(+), 28 deletions(-) diff --git a/CIDR_EXAMPLES.md b/CIDR_EXAMPLES.md index 68b02b8..fddfde3 100644 --- a/CIDR_EXAMPLES.md +++ b/CIDR_EXAMPLES.md @@ -5,23 +5,29 @@ This file demonstrates how to use CIDR range scanning in proxy-spider. ## Basic CIDR Examples # Single IP (equivalent to 192.168.1.100:8080) + 192.168.1.100/32:8080 -# Small subnet (4 IPs: .0, .1, .2, .3) +# Small subnet (4 IPs: .0, .1, .2, .3) + 192.168.1.0/30:3128 # Medium subnet (8 IPs: .0 through .7) + 10.0.0.0/29:1080 # Larger subnet (16 IPs: .240 through .255) + 172.16.1.240/28:8888 # Class C subnet (256 IPs: .0 through .255) + 203.0.113.0/24:9090 ## Mixed with Regular Entries # You can mix CIDR ranges with regular IP:port entries: + 192.168.1.0/30:8080 127.0.0.1:8888 10.0.0.0/31:3128 @@ -30,9 +36,12 @@ This file demonstrates how to use CIDR range scanning in proxy-spider. ## Comments and Invalid Lines # Lines starting with # are treated as comments and ignored + # Invalid CIDR ranges are preserved as-is for the regular parser + invalid-cidr-range:1234 not-an-ip:port # Different protocols can use the same format + # Just put them in the appropriate protocol section in your config diff --git a/Cargo.lock b/Cargo.lock index ec63c32..82711e2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1499,6 +1499,7 @@ dependencies = [ "hickory-resolver", "http", "httpdate", + "ipnetwork", "itertools 0.14.0", "log", "maxminddb", diff --git a/Cargo.toml b/Cargo.toml index 15ae250..6de32f5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,6 +19,7 @@ futures = { version = "=0.3.31", optional = true } hickory-resolver = "=0.25.2" http = "=1.3.1" httpdate = "=1.0.3" +ipnetwork = "=0.21.1" itertools = "=0.14" log = { version = "=0.4.28", features = [ "max_level_debug", diff --git a/config.toml b/config.toml index c3661f0..f220f7f 100644 --- a/config.toml +++ b/config.toml @@ -81,7 +81,7 @@ include_geolocation = true # This expands to all IPs in the range with the specified port. # Examples: # 192.168.1.0/30:8080 → 4 IPs (.0, .1, .2, .3) -# 10.0.0.0/24:3128 → 256 IPs (.0 through .255) +# 10.0.0.0/24:3128 → 256 IPs (.0 through .255) # 203.0.113.0/29:1080 → 8 IPs (.0 through .7) # Mix CIDR ranges with individual IP:port entries in the same file. @@ -90,10 +90,10 @@ enabled = true urls = [ # Local file examples: # "./my_http_proxies.txt", - # "/home/user/my_http_proxies.txt", + # "/home/user/my_http_proxies.txt", # "C:/Users/user/Desktop/my_http_proxies.txt", # "file:///home/user/my_http_proxies.txt", - + # CIDR range file example: # File containing: 192.168.1.0/24:8080 # 10.0.0.0/29:3128 diff --git a/out/.gitkeep b/out/.gitkeep index 8b13789..e69de29 100644 --- a/out/.gitkeep +++ b/out/.gitkeep @@ -1 +0,0 @@ - diff --git a/src/parsers.rs b/src/parsers.rs index e757c65..4089517 100644 --- a/src/parsers.rs +++ b/src/parsers.rs @@ -34,20 +34,20 @@ pub fn parse_ipv4(s: &str) -> Option { pub fn expand_cidr_ranges(text: &str) -> String { let mut result = text.to_string(); let mut offset: i32 = 0; - + // Find all CIDR matches and expand them let captures: Vec<_> = CIDR_REGEX.captures_iter(text) .filter_map(|m| m.ok()) .collect(); - + for capture in captures { if let (Some(network), Some(prefix), Some(port)) = ( - capture.name("network"), - capture.name("prefix"), + capture.name("network"), + capture.name("prefix"), capture.name("port") ) { let cidr_str = format!("{}/{}", network.as_str(), prefix.as_str()); - + match cidr_str.parse::() { Ok(network) => { // Generate expanded IPs @@ -55,29 +55,29 @@ pub fn expand_cidr_ranges(text: &str) -> String { .filter(|ip| ip.is_ipv4()) .map(|ip| format!("{}:{}", ip, port.as_str())) .collect(); - + if !expanded_ips.is_empty() { // Get the full match including any leading non-alphanumeric character let full_match = capture.get(0).unwrap(); let match_start = (full_match.start() as i32 + offset) as usize; let match_end = (full_match.end() as i32 + offset) as usize; - + // Determine what separator to use by checking what follows let separator = if match_end < result.len() { let next_char = result.chars().nth(match_end); match next_char { Some('\n') => "\n", - Some('\t') => "\t", + Some('\t') => "\t", Some(',') => ",", _ => " ", } } else { "\n" }; - + // Join expanded IPs with the detected separator let replacement = expanded_ips.join(separator); - + // Handle case where match starts with a delimiter character let (_actual_start, prefix_char) = if match_start > 0 { let prev_char = result.chars().nth(match_start); @@ -89,12 +89,12 @@ pub fn expand_cidr_ranges(text: &str) -> String { } else { (match_start, String::new()) }; - + let final_replacement = format!("{}{}", prefix_char, replacement); - + // Replace the CIDR pattern with expanded IPs result.replace_range(match_start..match_end, &final_replacement); - + // Update offset for subsequent replacements let len_diff = final_replacement.len() as i32 - (match_end - match_start) as i32; offset += len_diff; @@ -107,7 +107,7 @@ pub fn expand_cidr_ranges(text: &str) -> String { } } } - + result } @@ -121,7 +121,7 @@ mod tests { let input = "192.168.1.0/30:8080"; let result = expand_cidr_ranges(input); let lines: Vec<&str> = result.trim().split('\n').collect(); - + assert_eq!(lines.len(), 4); assert!(lines.contains(&"192.168.1.0:8080")); assert!(lines.contains(&"192.168.1.1:8080")); @@ -134,7 +134,7 @@ mod tests { let input = "192.168.1.0/31:8080\n127.0.0.1:9090\ninvalid-line"; let result = expand_cidr_ranges(input); let lines: Vec<&str> = result.trim().split('\n').collect(); - + // Should have 2 CIDR-expanded IPs + 1 regular IP + 1 invalid line assert_eq!(lines.len(), 4); assert!(lines.contains(&"192.168.1.0:8080")); @@ -155,19 +155,19 @@ mod tests { // Test space-separated entries with CIDR expansion let input = "192.168.1.0/31:8080 127.0.0.1:9090"; let result = expand_cidr_ranges(input); - + // Should expand the CIDR range and preserve the regular proxy assert!(result.contains("192.168.1.0:8080")); assert!(result.contains("192.168.1.1:8080")); assert!(result.contains("127.0.0.1:9090")); } - #[test] + #[test] fn test_multiple_cidr_same_line_behavior() { // Test multiple CIDR ranges on same line let input = "192.168.1.0/31:8080 10.0.0.0/31:3128"; let result = expand_cidr_ranges(input); - + // Should expand both CIDR ranges assert!(result.contains("192.168.1.0:8080")); assert!(result.contains("192.168.1.1:8080")); @@ -179,7 +179,7 @@ mod tests { fn test_comma_separated_cidr() { let input = "192.168.1.0/31:8080,10.0.0.0/31:3128"; let result = expand_cidr_ranges(input); - + // Should expand both CIDR ranges and preserve comma separation assert!(result.contains("192.168.1.0:8080")); assert!(result.contains("192.168.1.1:8080")); @@ -191,7 +191,7 @@ mod tests { fn test_mixed_separators() { let input = "192.168.1.0/31:8080\t10.0.0.1:3128,203.0.113.0/31:1080 127.0.0.1:9090"; let result = expand_cidr_ranges(input); - + // Should expand CIDR ranges and preserve non-CIDR entries assert!(result.contains("192.168.1.0:8080")); assert!(result.contains("192.168.1.1:8080")); diff --git a/src/scraper.rs b/src/scraper.rs index 7a7f42b..658899b 100644 --- a/src/scraper.rs +++ b/src/scraper.rs @@ -70,13 +70,14 @@ async fn scrape_one( // Expand CIDR ranges to individual IP:port entries let expanded_text = expand_cidr_ranges(&text); - + #[cfg(feature = "tui")] let mut seen_protocols = HashSet::new(); let mut new_proxies = HashSet::new(); - for (i, maybe_capture) in PROXY_REGEX.captures_iter(&expanded_text).enumerate() { + for (_, maybe_capture) in PROXY_REGEX.captures_iter(&expanded_text).enumerate() { + if config.scraping.max_proxies_per_source != 0 && new_proxies.len() >= config.scraping.max_proxies_per_source { From 93531826382073103a444e432115b60ed4d75103 Mon Sep 17 00:00:00 2001 From: KhulnaSoft bot <43526132+khulnasoft-bot@users.noreply.github.com> Date: Tue, 23 Sep 2025 02:39:23 +0000 Subject: [PATCH 7/7] Add CIDR range expansion support for proxy scanning --- src/parsers.rs | 40 ++++++++++++++++++++++++++++------------ src/scraper.rs | 5 +++-- 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/src/parsers.rs b/src/parsers.rs index 4089517..6da3ebd 100644 --- a/src/parsers.rs +++ b/src/parsers.rs @@ -36,22 +36,22 @@ pub fn expand_cidr_ranges(text: &str) -> String { let mut offset: i32 = 0; // Find all CIDR matches and expand them - let captures: Vec<_> = CIDR_REGEX.captures_iter(text) - .filter_map(|m| m.ok()) - .collect(); + let captures: Vec<_> = + CIDR_REGEX.captures_iter(text).filter_map(|m| m.ok()).collect(); for capture in captures { if let (Some(network), Some(prefix), Some(port)) = ( capture.name("network"), capture.name("prefix"), - capture.name("port") + capture.name("port"), ) { let cidr_str = format!("{}/{}", network.as_str(), prefix.as_str()); match cidr_str.parse::() { Ok(network) => { // Generate expanded IPs - let expanded_ips: Vec = network.iter() + let expanded_ips: Vec = network + .iter() .filter(|ip| ip.is_ipv4()) .map(|ip| format!("{}:{}", ip, port.as_str())) .collect(); @@ -59,8 +59,10 @@ pub fn expand_cidr_ranges(text: &str) -> String { if !expanded_ips.is_empty() { // Get the full match including any leading non-alphanumeric character let full_match = capture.get(0).unwrap(); - let match_start = (full_match.start() as i32 + offset) as usize; - let match_end = (full_match.end() as i32 + offset) as usize; + let match_start = + (full_match.start() as i32 + offset) as usize; + let match_end = + (full_match.end() as i32 + offset) as usize; // Determine what separator to use by checking what follows let separator = if match_end < result.len() { @@ -81,8 +83,17 @@ pub fn expand_cidr_ranges(text: &str) -> String { // Handle case where match starts with a delimiter character let (_actual_start, prefix_char) = if match_start > 0 { let prev_char = result.chars().nth(match_start); - if prev_char.map_or(false, |c| !c.is_ascii_alphanumeric()) { - (match_start + 1, result.chars().nth(match_start).unwrap().to_string()) + if prev_char + .map_or(false, |c| !c.is_ascii_alphanumeric()) + { + ( + match_start + 1, + result + .chars() + .nth(match_start) + .unwrap() + .to_string(), + ) } else { (match_start, String::new()) } @@ -90,13 +101,18 @@ pub fn expand_cidr_ranges(text: &str) -> String { (match_start, String::new()) }; - let final_replacement = format!("{}{}", prefix_char, replacement); + let final_replacement = + format!("{}{}", prefix_char, replacement); // Replace the CIDR pattern with expanded IPs - result.replace_range(match_start..match_end, &final_replacement); + result.replace_range( + match_start..match_end, + &final_replacement, + ); // Update offset for subsequent replacements - let len_diff = final_replacement.len() as i32 - (match_end - match_start) as i32; + let len_diff = final_replacement.len() as i32 + - (match_end - match_start) as i32; offset += len_diff; } } diff --git a/src/scraper.rs b/src/scraper.rs index 658899b..5dbc5d2 100644 --- a/src/scraper.rs +++ b/src/scraper.rs @@ -76,8 +76,9 @@ async fn scrape_one( let mut new_proxies = HashSet::new(); - for (_, maybe_capture) in PROXY_REGEX.captures_iter(&expanded_text).enumerate() { - + for (_, maybe_capture) in + PROXY_REGEX.captures_iter(&expanded_text).enumerate() + { if config.scraping.max_proxies_per_source != 0 && new_proxies.len() >= config.scraping.max_proxies_per_source {