scan performance
This commit is contained in:
15
.env.example
15
.env.example
@@ -35,9 +35,6 @@ SCAN_TIME=02:00
|
|||||||
SCAN_ON_STARTUP=true
|
SCAN_ON_STARTUP=true
|
||||||
CACHE_MAX_AGE_HOURS=168
|
CACHE_MAX_AGE_HOURS=168
|
||||||
|
|
||||||
# Parallel scanning
|
|
||||||
PARALLEL_WORKERS=8 # 0=auto
|
|
||||||
|
|
||||||
# Redis
|
# Redis
|
||||||
REDIS_HOST=redis
|
REDIS_HOST=redis
|
||||||
REDIS_PORT=6379
|
REDIS_PORT=6379
|
||||||
@@ -50,3 +47,15 @@ REDIS_CACHE_TTL=86400
|
|||||||
PRECACHE_INTERVAL_HOURS=168
|
PRECACHE_INTERVAL_HOURS=168
|
||||||
PRECACHE_CHECK_INTERVAL=3600
|
PRECACHE_CHECK_INTERVAL=3600
|
||||||
PRECACHE_MIN_TTL_HOURS=7
|
PRECACHE_MIN_TTL_HOURS=7
|
||||||
|
|
||||||
|
# MaxMind chunking
|
||||||
|
MAXMIND_CHUNK_TASKS_PER_WORKER=16
|
||||||
|
MAXMIND_CHUNK_MIN=200
|
||||||
|
MAXMIND_CHUNK_MAX=4000
|
||||||
|
|
||||||
|
# cap MaxMind workers per-country
|
||||||
|
MAXMIND_WORKERS_MAX=48
|
||||||
|
MAXMIND_WORKERS_MIN=6
|
||||||
|
|
||||||
|
# Parallel scanning
|
||||||
|
PARALLEL_WORKERS=8 # 0=auto
|
||||||
15
config.py
15
config.py
@@ -234,10 +234,17 @@ for country in COMMON_COUNTRIES:
|
|||||||
|
|
||||||
PRECACHE_APP_TYPES = [
|
PRECACHE_APP_TYPES = [
|
||||||
'nginx_geo',
|
'nginx_geo',
|
||||||
|
'nginx_map',
|
||||||
'nginx_deny',
|
'nginx_deny',
|
||||||
'apache_24',
|
'apache_24',
|
||||||
|
'apache_22',
|
||||||
'haproxy_acl',
|
'haproxy_acl',
|
||||||
|
'haproxy_lua',
|
||||||
|
'haproxy_map',
|
||||||
'raw-cidr_txt',
|
'raw-cidr_txt',
|
||||||
|
'raw-newline_txt',
|
||||||
|
'raw-json',
|
||||||
|
'raw-csv',
|
||||||
]
|
]
|
||||||
|
|
||||||
PRECACHE_AGGREGATE_VARIANTS = [True]
|
PRECACHE_AGGREGATE_VARIANTS = [True]
|
||||||
@@ -257,3 +264,11 @@ CACHE_MAX_AGE_HOURS = 168
|
|||||||
PRECACHE_INTERVAL_HOURS = int(os.getenv('PRECACHE_INTERVAL_HOURS', 168))
|
PRECACHE_INTERVAL_HOURS = int(os.getenv('PRECACHE_INTERVAL_HOURS', 168))
|
||||||
PRECACHE_CHECK_INTERVAL = int(os.getenv('PRECACHE_CHECK_INTERVAL', 3600))
|
PRECACHE_CHECK_INTERVAL = int(os.getenv('PRECACHE_CHECK_INTERVAL', 3600))
|
||||||
PRECACHE_MIN_TTL_HOURS = int(os.getenv('PRECACHE_MIN_TTL_HOURS', 7))
|
PRECACHE_MIN_TTL_HOURS = int(os.getenv('PRECACHE_MIN_TTL_HOURS', 7))
|
||||||
|
|
||||||
|
# MaxMind scan chunking
|
||||||
|
MAXMIND_CHUNK_TASKS_PER_WORKER = int(os.getenv('MAXMIND_CHUNK_TASKS_PER_WORKER', '16'))
|
||||||
|
MAXMIND_CHUNK_MIN = int(os.getenv('MAXMIND_CHUNK_MIN', '200'))
|
||||||
|
MAXMIND_CHUNK_MAX = int(os.getenv('MAXMIND_CHUNK_MAX', '4000'))
|
||||||
|
|
||||||
|
MAXMIND_WORKERS_MIN = int(os.getenv('MAXMIND_WORKERS_MIN', '6'))
|
||||||
|
MAXMIND_WORKERS_MAX = int(os.getenv('MAXMIND_WORKERS_MAX', '48'))
|
||||||
119
geoip_handler.py
119
geoip_handler.py
@@ -13,7 +13,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
|||||||
import threading
|
import threading
|
||||||
import config
|
import config
|
||||||
import ipaddress
|
import ipaddress
|
||||||
|
import math
|
||||||
|
from multiprocessing import cpu_count
|
||||||
|
|
||||||
def generate_metadata(countries: list, country_data: dict, redis_stats: dict = None, handler: 'GeoIPHandler' = None) -> dict:
|
def generate_metadata(countries: list, country_data: dict, redis_stats: dict = None, handler: 'GeoIPHandler' = None) -> dict:
|
||||||
"""
|
"""
|
||||||
@@ -568,70 +569,100 @@ class GeoIPHandler:
|
|||||||
|
|
||||||
return scan_ranges
|
return scan_ranges
|
||||||
|
|
||||||
def _scan_maxmind_for_country(self, country_code: str, progress_callback=None) -> list:
|
def _scan_maxmind_for_country(self, country_code: str, progress_callback=None, workers=None) -> list:
|
||||||
if not self.mmdb_file.exists():
|
if not self.mmdb_file.exists():
|
||||||
return []
|
return []
|
||||||
|
|
||||||
country_code = country_code.upper()
|
country_code = country_code.upper()
|
||||||
|
|
||||||
|
scan_ranges = self._get_scan_ranges()
|
||||||
|
total_ranges = len(scan_ranges)
|
||||||
|
|
||||||
|
# workers default
|
||||||
|
if workers is None or int(workers) <= 0:
|
||||||
|
workers = min(32, max(4, cpu_count() * 2))
|
||||||
|
else:
|
||||||
|
workers = int(workers)
|
||||||
|
|
||||||
|
tasks_per_worker = getattr(config, "MAXMIND_CHUNK_TASKS_PER_WORKER", 12)
|
||||||
|
chunk_min = getattr(config, "MAXMIND_CHUNK_MIN", 50)
|
||||||
|
chunk_max = getattr(config, "MAXMIND_CHUNK_MAX", 2000)
|
||||||
|
|
||||||
|
target_tasks = max(workers * int(tasks_per_worker), workers)
|
||||||
|
chunk = int(math.ceil(total_ranges / float(target_tasks)))
|
||||||
|
CHUNK = max(int(chunk_min), min(int(chunk_max), chunk))
|
||||||
|
|
||||||
|
if progress_callback:
|
||||||
|
progress_callback(f"Starting parallel MaxMind scan with {workers} workers...")
|
||||||
|
progress_callback(f"Scanning {total_ranges} IP ranges...")
|
||||||
|
progress_callback(f"Chunking: {CHUNK} ranges/task (~{int(math.ceil(total_ranges/float(CHUNK)))} tasks)")
|
||||||
|
|
||||||
found_networks = set()
|
found_networks = set()
|
||||||
found_networks_lock = threading.Lock()
|
found_networks_lock = threading.Lock()
|
||||||
|
|
||||||
try:
|
completed = 0
|
||||||
if progress_callback:
|
completed_lock = threading.Lock()
|
||||||
progress_callback(f"Starting parallel MaxMind scan with 32 workers...")
|
|
||||||
|
|
||||||
scan_ranges = self._get_scan_ranges()
|
tls = threading.local()
|
||||||
total_ranges = len(scan_ranges)
|
|
||||||
|
|
||||||
if progress_callback:
|
def get_reader():
|
||||||
progress_callback(f"Scanning {total_ranges} IP ranges...")
|
r = getattr(tls, "reader", None)
|
||||||
|
if r is None:
|
||||||
|
tls.reader = geoip2.database.Reader(str(self.mmdb_file))
|
||||||
|
return tls.reader
|
||||||
|
|
||||||
completed = 0
|
def scan_one_range(reader, network_str: str):
|
||||||
completed_lock = threading.Lock()
|
local = set()
|
||||||
|
try:
|
||||||
|
network = ipaddress.IPv4Network(network_str, strict=False)
|
||||||
|
for subnet in network.subnets(new_prefix=24):
|
||||||
|
sample_ip = str(subnet.network_address + 1)
|
||||||
|
try:
|
||||||
|
resp = reader.country(sample_ip)
|
||||||
|
if resp.country.iso_code == country_code:
|
||||||
|
local.add(subnet) # mniej alokacji niż str() w pętli
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return local
|
||||||
|
|
||||||
def scan_range(network_str):
|
def scan_chunk(ranges):
|
||||||
nonlocal completed
|
nonlocal completed
|
||||||
|
reader = get_reader()
|
||||||
|
local_chunk = set()
|
||||||
|
|
||||||
reader = geoip2.database.Reader(str(self.mmdb_file))
|
for r in ranges:
|
||||||
local_networks = set()
|
local_chunk.update(scan_one_range(reader, r))
|
||||||
|
|
||||||
try:
|
|
||||||
network = ipaddress.IPv4Network(network_str, strict=False)
|
|
||||||
|
|
||||||
for subnet in network.subnets(new_prefix=24):
|
|
||||||
sample_ip = str(subnet.network_address + 1)
|
|
||||||
|
|
||||||
try:
|
|
||||||
response = reader.country(sample_ip)
|
|
||||||
if response.country.iso_code == country_code:
|
|
||||||
local_networks.add(str(subnet))
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
pass
|
|
||||||
finally:
|
|
||||||
reader.close()
|
|
||||||
|
|
||||||
with completed_lock:
|
with completed_lock:
|
||||||
completed += 1
|
completed += 1
|
||||||
if completed % 2000 == 0 and progress_callback:
|
c = completed
|
||||||
with found_networks_lock:
|
|
||||||
progress_pct = (completed / total_ranges) * 100
|
|
||||||
progress_callback(f"Scanning: {completed}/{total_ranges} ranges ({progress_pct:.1f}%), found {len(found_networks)} networks")
|
|
||||||
|
|
||||||
return local_networks
|
# progres częściej (diagnostyka), nie wpływa na wynik
|
||||||
|
if progress_callback and (c % 500 == 0 or c == total_ranges):
|
||||||
|
with found_networks_lock:
|
||||||
|
found_cnt = len(found_networks)
|
||||||
|
pct = (c / float(total_ranges)) * 100.0
|
||||||
|
progress_callback(
|
||||||
|
f"Scanning: {c}/{total_ranges} ranges ({pct:.1f}%), found {found_cnt} networks"
|
||||||
|
)
|
||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=32) as executor:
|
return local_chunk
|
||||||
futures = {executor.submit(scan_range, r): r for r in scan_ranges}
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
chunks = [scan_ranges[i:i + CHUNK] for i in range(0, total_ranges, CHUNK)]
|
||||||
|
|
||||||
|
with ThreadPoolExecutor(max_workers=workers) as executor:
|
||||||
|
futures = [executor.submit(scan_chunk, ch) for ch in chunks]
|
||||||
for future in as_completed(futures):
|
for future in as_completed(futures):
|
||||||
local_nets = future.result()
|
local_nets = future.result()
|
||||||
|
if local_nets:
|
||||||
|
with found_networks_lock:
|
||||||
|
found_networks.update(local_nets)
|
||||||
|
|
||||||
with found_networks_lock:
|
# konwersja na string na końcu (wynik ten sam co wcześniej)
|
||||||
found_networks.update(local_nets)
|
result = [str(n) for n in found_networks]
|
||||||
|
|
||||||
result = list(found_networks)
|
|
||||||
|
|
||||||
if progress_callback:
|
if progress_callback:
|
||||||
progress_callback(f"MaxMind scan complete: {len(result)} networks")
|
progress_callback(f"MaxMind scan complete: {len(result)} networks")
|
||||||
|
|||||||
36
scheduler.py
36
scheduler.py
@@ -14,7 +14,7 @@ from pathlib import Path
|
|||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from multiprocessing import cpu_count
|
from multiprocessing import cpu_count
|
||||||
import threading
|
import threading
|
||||||
|
import traceback
|
||||||
|
|
||||||
sys.path.insert(0, str(Path(__file__).parent))
|
sys.path.insert(0, str(Path(__file__).parent))
|
||||||
|
|
||||||
@@ -29,6 +29,21 @@ write_lock = threading.Lock()
|
|||||||
active_scans = {}
|
active_scans = {}
|
||||||
active_scans_lock = threading.Lock()
|
active_scans_lock = threading.Lock()
|
||||||
|
|
||||||
|
def heartbeat():
|
||||||
|
log_safe(f"[{datetime.now()}] HEARTBEAT running=True next_run={schedule.next_run()} jobs={len(schedule.jobs)}")
|
||||||
|
|
||||||
|
def compute_maxmind_workers():
|
||||||
|
with active_scans_lock:
|
||||||
|
active = max(1, len(active_scans))
|
||||||
|
|
||||||
|
cpu = cpu_count()
|
||||||
|
total_budget = max(32, cpu * 6) # 16*6 = 96
|
||||||
|
per_country = max(4, total_budget // active)
|
||||||
|
|
||||||
|
min_w = int(os.getenv('MAXMIND_WORKERS_MIN', '6'))
|
||||||
|
max_w = int(os.getenv('MAXMIND_WORKERS_MAX', '48'))
|
||||||
|
|
||||||
|
return max(min_w, min(max_w, per_country))
|
||||||
|
|
||||||
def signal_handler(signum, frame):
|
def signal_handler(signum, frame):
|
||||||
global running
|
global running
|
||||||
@@ -96,7 +111,14 @@ def scan_single_country(country_code, is_update=False):
|
|||||||
|
|
||||||
print(f"[{country_code}] Scanning MaxMind + GitHub...", flush=True)
|
print(f"[{country_code}] Scanning MaxMind + GitHub...", flush=True)
|
||||||
|
|
||||||
maxmind_networks = handler._scan_maxmind_for_country(country_code, progress_callback=progress_cb)
|
maxmind_workers = compute_maxmind_workers()
|
||||||
|
print(f"[{country_code}] MaxMind workers: {maxmind_workers} (active scans: {len(active_scans)})", flush=True)
|
||||||
|
|
||||||
|
maxmind_networks = handler._scan_maxmind_for_country(
|
||||||
|
country_code,
|
||||||
|
progress_callback=progress_cb,
|
||||||
|
workers=maxmind_workers
|
||||||
|
)
|
||||||
|
|
||||||
if maxmind_networks:
|
if maxmind_networks:
|
||||||
print(f"[{country_code}] MaxMind: {len(maxmind_networks):,} networks, checking GitHub...", flush=True)
|
print(f"[{country_code}] MaxMind: {len(maxmind_networks):,} networks, checking GitHub...", flush=True)
|
||||||
@@ -386,8 +408,16 @@ if __name__ == '__main__':
|
|||||||
print("\nScheduler running. Press Ctrl+C to stop.\n", flush=True)
|
print("\nScheduler running. Press Ctrl+C to stop.\n", flush=True)
|
||||||
sys.stdout.flush()
|
sys.stdout.flush()
|
||||||
|
|
||||||
|
# heartbeat
|
||||||
|
schedule.every(1).minutes.do(heartbeat)
|
||||||
|
|
||||||
while running:
|
while running:
|
||||||
schedule.run_pending()
|
try:
|
||||||
|
schedule.run_pending()
|
||||||
|
except Exception as e:
|
||||||
|
log_safe(f"[{datetime.now()}] ERROR in run_pending: {e}")
|
||||||
|
traceback.print_exc()
|
||||||
|
sys.stdout.flush()
|
||||||
time.sleep(60)
|
time.sleep(60)
|
||||||
|
|
||||||
print("\n[SHUTDOWN] Stopped gracefully.", flush=True)
|
print("\n[SHUTDOWN] Stopped gracefully.", flush=True)
|
||||||
|
|||||||
Reference in New Issue
Block a user