diff --git a/CIDR_EXAMPLES.md b/CIDR_EXAMPLES.md new file mode 100644 index 0000000..fddfde3 --- /dev/null +++ b/CIDR_EXAMPLES.md @@ -0,0 +1,47 @@ +# CIDR Range Scanning Examples + +This file demonstrates how to use CIDR range scanning in proxy-spider. + +## Basic CIDR Examples + +# Single IP (equivalent to 192.168.1.100:8080) + +192.168.1.100/32:8080 + +# Small subnet (4 IPs: .0, .1, .2, .3) + +192.168.1.0/30:3128 + +# Medium subnet (8 IPs: .0 through .7) + +10.0.0.0/29:1080 + +# Larger subnet (16 IPs: .240 through .255) + +172.16.1.240/28:8888 + +# Class C subnet (256 IPs: .0 through .255) + +203.0.113.0/24:9090 + +## Mixed with Regular Entries + +# You can mix CIDR ranges with regular IP:port entries: + +192.168.1.0/30:8080 +127.0.0.1:8888 +10.0.0.0/31:3128 +8.8.8.8:53 + +## Comments and Invalid Lines + +# Lines starting with # are treated as comments and ignored + +# Invalid CIDR ranges are preserved as-is for the regular parser + +invalid-cidr-range:1234 +not-an-ip:port + +# Different protocols can use the same format + +# Just put them in the appropriate protocol section in your config diff --git a/Cargo.lock b/Cargo.lock index ec63c32..82711e2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1499,6 +1499,7 @@ dependencies = [ "hickory-resolver", "http", "httpdate", + "ipnetwork", "itertools 0.14.0", "log", "maxminddb", diff --git a/Cargo.toml b/Cargo.toml index 15ae250..6de32f5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,6 +19,7 @@ futures = { version = "=0.3.31", optional = true } hickory-resolver = "=0.25.2" http = "=1.3.1" httpdate = "=1.0.3" +ipnetwork = "=0.21.1" itertools = "=0.14" log = { version = "=0.4.28", features = [ "max_level_debug", diff --git a/config.toml b/config.toml index 58dc6f5..f220f7f 100644 --- a/config.toml +++ b/config.toml @@ -17,7 +17,7 @@ connect_timeout = 5.0 proxy = "" # User-Agent header for scraping requests -user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 
Safari/537.36" +user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36" [checking] @@ -42,7 +42,7 @@ timeout = 60.0 connect_timeout = 5.0 # User-Agent header for proxy check requests -user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36" +user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36" [output] @@ -74,6 +74,16 @@ include_geolocation = true # Proxy sources configuration # Add local files: ["./my_proxies.txt"] or URLs # Sources are fetched in parallel for speed +# +# CIDR Range Support: +# You can now use CIDR notation to scan entire network ranges! +# Format: IP/prefix:port (e.g., "192.168.1.0/24:8080") +# This expands to all IPs in the range with the specified port. +# Examples: +# 192.168.1.0/30:8080 → 4 IPs (.0, .1, .2, .3) +# 10.0.0.0/24:3128 → 256 IPs (.0 through .255) +# 203.0.113.0/29:1080 → 8 IPs (.0 through .7) +# Mix CIDR ranges with individual IP:port entries in the same file. 
[scraping.http] enabled = true @@ -84,6 +94,12 @@ urls = [ # "C:/Users/user/Desktop/my_http_proxies.txt", # "file:///home/user/my_http_proxies.txt", + # CIDR range file example: + # File containing: 192.168.1.0/24:8080 + # 10.0.0.0/29:3128 + # 203.0.113.1:1080 + # "./my_cidr_ranges.txt", + # Advanced URL configuration examples (with basic auth or custom headers): # HTTP Basic Auth example: # { url = "https://some.api/endpoint", basic_auth = { username = "user", password = "password123" } }, diff --git a/out/.gitkeep b/out/.gitkeep index 8b13789..e69de29 100644 --- a/out/.gitkeep +++ b/out/.gitkeep @@ -1 +0,0 @@ - diff --git a/src/parsers.rs b/src/parsers.rs index 902e76f..6da3ebd 100644 --- a/src/parsers.rs +++ b/src/parsers.rs @@ -1,5 +1,7 @@ use std::sync::LazyLock; +use ipnetwork::IpNetwork; + pub static PROXY_REGEX: LazyLock = LazyLock::new(|| { let pattern = r"(?:^|[^0-9A-Za-z])(?:(?Phttps?|socks[45]):\/\/)?(?:(?P[0-9A-Za-z]{1,64}):(?P[0-9A-Za-z]{1,64})@)?(?P[A-Za-z][\-\.A-Za-z]{0,251}[A-Za-z]|[A-Za-z]|(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3}):(?P[0-9]|[1-9][0-9]{1,3}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])(?=[^0-9A-Za-z]|$)"; fancy_regex::RegexBuilder::new(pattern) @@ -13,6 +15,11 @@ static IPV4_REGEX: LazyLock = LazyLock::new(|| { fancy_regex::Regex::new(pattern).unwrap() }); +static CIDR_REGEX: LazyLock = LazyLock::new(|| { + let pattern = r"(?:^|[^0-9A-Za-z])(?P(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])(?:\.(?:[0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])){3})/(?P[0-9]|[12][0-9]|3[0-2]):(?P[0-9]|[1-9][0-9]{1,3}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])(?=[^0-9A-Za-z]|$)"; + fancy_regex::Regex::new(pattern).unwrap() +}); + pub fn parse_ipv4(s: &str) -> Option { if let Ok(Some(captures)) = IPV4_REGEX.captures(s) { captures.name("host").map(|capture| capture.as_str().to_owned()) @@ -20,3 +27,193 @@ pub fn parse_ipv4(s: &str) 
-> Option { None } } +
+/// Smallest CIDR prefix length that [`expand_cidr_ranges`] will expand.
+///
+/// A `/16` already yields 65,536 entries per match. Source text is fetched
+/// from remote lists, so expanding an unbounded range such as `/0`
+/// (2^32 addresses) would exhaust memory.
+const MIN_CIDR_PREFIX: u8 = 16;
+
+/// Expands `IP/prefix:port` CIDR ranges in `text` into individual `IP:port`
+/// entries.
+///
+/// Every match of `CIDR_REGEX` (for example `192.168.1.0/30:8080`) is
+/// replaced by one `IP:port` entry per address in the network. All other
+/// text, including the delimiter in front of a match, is copied through
+/// unchanged.
+///
+/// The expanded entries are joined with the character that follows the match
+/// when it is a newline, tab or comma; with a newline when the match ends the
+/// text; and with a single space otherwise.
+///
+/// Matches that fail to parse as a network, or whose prefix is shorter than
+/// [`MIN_CIDR_PREFIX`], are left in the output verbatim.
+pub fn expand_cidr_ranges(text: &str) -> String {
+    let mut result = String::with_capacity(text.len());
+    // Byte offset in `text` up to which the input has already been emitted.
+    let mut copied_to = 0;
+
+    // Matches arrive in order and never overlap, so the output is built in a
+    // single forward pass. Regex runtime errors are skipped (best effort).
+    for capture in CIDR_REGEX.captures_iter(text).filter_map(Result::ok) {
+        let (Some(full_match), Some(network), Some(prefix), Some(port)) = (
+            capture.get(0),
+            capture.name("network"),
+            capture.name("prefix"),
+            capture.name("port"),
+        ) else {
+            continue;
+        };
+
+        let cidr = format!("{}/{}", network.as_str(), prefix.as_str());
+        let Ok(parsed) = cidr.parse::<IpNetwork>() else {
+            // Not a valid network: keep the original text.
+            continue;
+        };
+        if parsed.prefix() < MIN_CIDR_PREFIX {
+            // Too large to expand safely: keep the original text.
+            continue;
+        }
+
+        // The match may begin with a delimiter; the `network` group starts
+        // right after it, so this slice keeps that delimiter in the output.
+        // Regex offsets are byte offsets and are only ever used for slicing.
+        result.push_str(&text[copied_to..network.start()]);
+
+        let separator = match text[full_match.end()..].chars().next() {
+            None | Some('\n') => "\n",
+            Some('\t') => "\t",
+            Some(',') => ",",
+            Some(_) => " ",
+        };
+
+        for (index, ip) in parsed.iter().enumerate() {
+            if index > 0 {
+                result.push_str(separator);
+            }
+            result.push_str(&ip.to_string());
+            result.push(':');
+            result.push_str(port.as_str());
+        }
+
+        copied_to = full_match.end();
+    }
+
+    result.push_str(&text[copied_to..]);
+    result
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_cidr_expansion() {
+        // A match at the end of the text is joined with newlines.
+        assert_eq!(
+            expand_cidr_ranges("192.168.1.0/30:8080"),
+            "192.168.1.0:8080\n192.168.1.1:8080\n192.168.1.2:8080\n192.168.1.3:8080"
+        );
+    }
+
+    #[test]
+    fn test_mixed_input() {
+        // Non-CIDR lines pass through untouched.
+        assert_eq!(
+            expand_cidr_ranges(
+                "192.168.1.0/31:8080\n127.0.0.1:9090\ninvalid-line"
+            ),
+            "192.168.1.0:8080\n192.168.1.1:8080\n127.0.0.1:9090\ninvalid-line"
+        );
+    }
+
+    #[test]
+    fn test_single_ip_cidr() {
+        assert_eq!(expand_cidr_ranges("10.0.0.1/32:3128"), "10.0.0.1:3128");
+    }
+
+    #[test]
+    fn test_non_newline_separated_behavior() {
+        assert_eq!(
+            expand_cidr_ranges("192.168.1.0/31:8080 127.0.0.1:9090"),
+            "192.168.1.0:8080 192.168.1.1:8080 127.0.0.1:9090"
+        );
+    }
+
+    #[test]
+    fn test_multiple_cidr_same_line_behavior() {
+        assert_eq!(
+            expand_cidr_ranges("192.168.1.0/31:8080 10.0.0.0/31:3128"),
+            "192.168.1.0:8080 192.168.1.1:8080 10.0.0.0:3128\n10.0.0.1:3128"
+        );
+    }
+
+    #[test]
+    fn test_comma_separated_cidr() {
+        assert_eq!(
+            expand_cidr_ranges("192.168.1.0/31:8080,10.0.0.0/31:3128"),
+            "192.168.1.0:8080,192.168.1.1:8080,10.0.0.0:3128\n10.0.0.1:3128"
+        );
+    }
+
+    #[test]
+    fn test_mixed_separators() {
+        assert_eq!(
+            expand_cidr_ranges(
+                "192.168.1.0/31:8080\t10.0.0.1:3128,203.0.113.0/31:1080 127.0.0.1:9090"
+            ),
+            "192.168.1.0:8080\t192.168.1.1:8080\t10.0.0.1:3128,203.0.113.0:1080 203.0.113.1:1080 127.0.0.1:9090"
+        );
+    }
+
+    #[test]
+    fn test_leading_delimiter_at_start_is_kept() {
+        assert_eq!(
+            expand_cidr_ranges(" 10.0.0.0/31:80"),
+            " 10.0.0.0:80\n10.0.0.1:80"
+        );
+    }
+
+    #[test]
+    fn test_non_ascii_text_before_match() {
+        // Multi-byte characters must not shift separator detection.
+        assert_eq!(
+            expand_cidr_ranges("café 10.0.0.0/31:80 ok"),
+            "café 10.0.0.0:80 10.0.0.1:80 ok"
+        );
+    }
+
+    #[test]
+    fn test_oversized_network_is_left_unexpanded() {
+        let input = "10.0.0.0/8:3128\n0.0.0.0/0:80\n127.0.0.1:9090";
+        assert_eq!(expand_cidr_ranges(input), input);
+    }
+}
diff --git a/src/scraper.rs b/src/scraper.rs index 11a631d..5dbc5d2 100644 --- a/src/scraper.rs +++ b/src/scraper.rs @@ -8,7 +8,7 @@ use crate::event::{AppEvent, Event}; use crate::{ HashSet, config::{Config, Source}, - parsers::PROXY_REGEX, + parsers::{PROXY_REGEX, expand_cidr_ranges}, proxy::{Proxy, ProxyType}, utils::pretty_error, }; @@ -68,12 +68,17 @@ async fn scrape_one( } }; + // Expand CIDR ranges to individual IP:port entries + let expanded_text = expand_cidr_ranges(&text); +
#[cfg(feature = "tui")] let mut seen_protocols = HashSet::new(); let mut new_proxies = HashSet::new(); - for maybe_capture in PROXY_REGEX.captures_iter(&text) { + for (_, maybe_capture) in + PROXY_REGEX.captures_iter(&expanded_text).enumerate() + { if config.scraping.max_proxies_per_source != 0 && new_proxies.len() >= config.scraping.max_proxies_per_source { @@ -121,7 +126,7 @@ async fn scrape_one( } drop(config); - drop(text); + drop(expanded_text); if new_proxies.is_empty() { tracing::warn!("{}: no proxies found", source.url); diff --git a/test_config.toml b/test_config.toml new file mode 100644 index 0000000..68e4f0f --- /dev/null +++ b/test_config.toml @@ -0,0 +1,144 @@ +# Enable debug logging (shows detailed checking process) +# Warning: Produces very verbose output +debug = false + + +[scraping] +# Maximum proxies to collect per source (0 = unlimited) +# Helps skip unreliable sources with too many proxies +max_proxies_per_source = 100000 + +# Request timeout for fetching proxy sources (seconds) +# Higher values may find more sources but take longer +timeout = 60.0 +connect_timeout = 5.0 + +# HTTP(S),SOCKS4 or SOCKS5 proxy used for fetching sources (e.g., "socks5://user:pass@host:port"). Leave empty to disable. +proxy = "" + +# User-Agent header for scraping requests +user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36" + + +[checking] +# URL for checking proxy functionality +# httpbin-compatible: Returns JSON with IP info for ASN/geo data +# plain-text: Returns just IP address for basic connectivity +# Examples: +# "https://httpbin.org/ip" - JSON with "origin" key. Full featured checking. +# "https://api.ipify.org" - Simple IP return. Full featured checking. 
+# "https://google.com" - Basic connect/read check +# "" - Skip checking (scrape only) +check_url = "https://api.ipify.org" + +# Concurrent proxy checks (adjust based on RAM/network capacity) +# Start low and increase gradually if system handles it well +max_concurrent_checks = 1024 + +# Proxy response timeout (seconds) +# Higher = more working proxies found, slower checking +# Lower = faster checking, may miss slower proxies +timeout = 60.0 +connect_timeout = 5.0 + +# User-Agent header for proxy check requests +user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36" + + +[output] +# Output directory (Docker ignores this setting) +path = "./out" + +# Sort by response time (true) or IP address (false) +sort_by_speed = true + + +# Plain text output (.txt files) +[output.txt] +enabled = false + + +# JSON output with metadata (.json files) +[output.json] +enabled = false + +# Add ASN (network provider) info to JSON output +# Uses offline MaxMind database +include_asn = true + +# Add geolocation (country/city) info to JSON output +# Uses offline MaxMind database +include_geolocation = true + + +# Proxy sources configuration +# Add local files: ["./my_proxies.txt"] or URLs +# Sources are fetched in parallel for speed +[scraping.http] +enabled = true +urls = ["file:///tmp/test_cidr.txt", + # Local file examples: + # "./my_http_proxies.txt", + # "/home/user/my_http_proxies.txt", + # "C:/Users/user/Desktop/my_http_proxies.txt", + # "file:///home/user/my_http_proxies.txt", + + # Advanced URL configuration examples (with basic auth or custom headers): + # HTTP Basic Auth example: + # { url = "https://some.api/endpoint", basic_auth = { username = "user", password = "password123" } }, + # Custom headers example: + # { url = "https://some.api/endpoint", headers = { Authorization = "Bearer YOUR_API_KEY" } }, + + "https://api.proxyscrape.com/v3/free-proxy-list/get?request=getproxies&protocol=http", 
"https://api.proxyscrape.com/v3/free-proxy-list/get?request=getproxies&protocol=https", + "https://raw.githubusercontent.com/proxifly/free-proxy-list/refs/heads/main/proxies/protocols/http/data.txt", + "https://raw.githubusercontent.com/proxifly/free-proxy-list/refs/heads/main/proxies/protocols/https/data.txt", + "https://raw.githubusercontent.com/roosterkid/openproxylist/refs/heads/main/HTTPS_RAW.txt", + "https://raw.githubusercontent.com/sunny9577/proxy-scraper/refs/heads/master/generated/http_proxies.txt", + "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/refs/heads/master/http.txt", +] + +[scraping.socks4] +enabled = false +urls = [ + # Local file examples: + # "./my_socks4_proxies.txt", + # "/home/user/my_socks4_proxies.txt", + # "C:/Users/user/Desktop/my_socks4_proxies.txt", + # "file:///home/user/my_socks4_proxies.txt", + + # Advanced URL configuration examples (with basic auth or custom headers): + # HTTP Basic Auth example: + # { url = "https://some.api/endpoint", basic_auth = { username = "user", password = "password123" } }, + # Custom headers example: + # { url = "https://some.api/endpoint", headers = { Authorization = "Bearer YOUR_API_KEY" } }, + + "https://api.proxyscrape.com/v3/free-proxy-list/get?request=getproxies&protocol=socks4", + "https://raw.githubusercontent.com/proxifly/free-proxy-list/refs/heads/main/proxies/protocols/socks4/data.txt", + "https://raw.githubusercontent.com/roosterkid/openproxylist/refs/heads/main/SOCKS4_RAW.txt", + "https://raw.githubusercontent.com/sunny9577/proxy-scraper/refs/heads/master/generated/socks4_proxies.txt", + "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/refs/heads/master/socks4.txt", +] + +[scraping.socks5] +enabled = false +urls = [ + # Local file examples: + # "./my_socks5_proxies.txt", + # "/home/user/my_socks5_proxies.txt", + # "C:/Users/user/Desktop/my_socks5_proxies.txt", + # "file:///home/user/my_socks5_proxies.txt", + + # Advanced URL configuration examples (with basic auth or custom 
headers): + # HTTP Basic Auth example: + # { url = "https://some.api/endpoint", basic_auth = { username = "user", password = "password123" } }, + # Custom headers example: + # { url = "https://some.api/endpoint", headers = { Authorization = "Bearer YOUR_API_KEY" } }, + + "https://api.proxyscrape.com/v3/free-proxy-list/get?request=getproxies&protocol=socks5", + "https://raw.githubusercontent.com/hookzof/socks5_list/refs/heads/master/proxy.txt", + "https://raw.githubusercontent.com/proxifly/free-proxy-list/refs/heads/main/proxies/protocols/socks5/data.txt", + "https://raw.githubusercontent.com/roosterkid/openproxylist/refs/heads/main/SOCKS5_RAW.txt", + "https://raw.githubusercontent.com/sunny9577/proxy-scraper/refs/heads/master/generated/socks5_proxies.txt", + "https://raw.githubusercontent.com/TheSpeedX/PROXY-List/refs/heads/master/socks5.txt", +]