import re
import time
# VULNERABLE: Nested quantifiers in email validation
def validate_email_vulnerable(email):
    """Report whether ``email`` matches the demonstration e-mail pattern.

    WARNING: intentionally unsafe counter-example. Each ``(...+)+`` group
    nests one quantifier inside another, so an input that cannot match may
    force the backtracking engine through an exponential number of attempts
    (ReDoS). Do not call this on untrusted input.

    Args:
        email: String to check.

    Returns:
        True when the pattern matches from the start of ``email``.
    """
    email_regex = re.compile(
        r'^([a-zA-Z0-9_.+-]+)+@([a-zA-Z0-9-]+)+\.([a-zA-Z0-9-.]+)+$'
    )
    found = email_regex.match(email)
    return found is not None
# VULNERABLE: Complex pattern for parsing user input
def parse_user_data_vulnerable(data):
    """Check the ``name``, ``address`` and ``comment`` entries of ``data``.

    WARNING: intentionally unsafe counter-example. The patterns use nested
    or overlapping quantifiers and are kept only to illustrate catastrophic
    backtracking. Do not call this on untrusted input.

    Args:
        data: Mapping of field name to the string to validate.

    Returns:
        Dict with one boolean per known field: True when the field is
        present and its value matches, False when it is missing or does
        not match.
    """
    field_regexes = {
        'name': r'^([A-Za-z]+\s*)+$',  # quantified group around a quantified class
        'address': r'^([0-9A-Za-z\s,.-]+)*$',  # same shape, with a starred group
        'comment': r'^(.|\n)*$',  # starred alternation
    }
    return {
        key: (key in data) and (re.match(regex, data[key]) is not None)
        for key, regex in field_regexes.items()
    }
# VULNERABLE: Log parsing with complex regex
def parse_log_entries_vulnerable(log_content):
    """Extract timestamp, level and message from each matching log line.

    WARNING: kept as a counter-example. The adjacent ``\\s+``, ``(.+)`` and
    ``\\s*`` pieces can all consume the same whitespace, which the file
    flags as prone to heavy backtracking. Do not call this on untrusted
    input.

    Args:
        log_content: Log text; lines are separated by ``'\\n'``.

    Returns:
        List of dicts with ``timestamp``, ``level`` and ``message`` keys,
        one per line in which the pattern is found, in input order.
    """
    entry_regex = re.compile(
        r'(\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2})\s+([A-Z]+)\s+(.+)\s*'
    )
    field_names = ('timestamp', 'level', 'message')
    hits = (entry_regex.search(row) for row in log_content.split('\n'))
    # The three capture groups line up with field_names in order.
    return [dict(zip(field_names, hit.groups())) for hit in hits if hit]
# VULNERABLE: HTML tag extraction
def extract_html_tags_vulnerable(html):
    """Return the names of opening/self-closing tags found in ``html``.

    WARNING: intentionally unsafe counter-example. The attribute group is
    itself repeated and contains optional whitespace and an alternation,
    which the file flags as nested quantifiers. Do not call this on
    untrusted input.

    Args:
        html: Markup to scan.

    Returns:
        Tag names (first capture group) in order of appearance.
    """
    tag_regex = re.compile(
        r'<([a-zA-Z][a-zA-Z0-9]*)(\s+[a-zA-Z][a-zA-Z0-9]*\s*=\s*(["\'][^"\']*["\']|[^\s>]*))*\s*/?>'
    )
    return [found.group(1) for found in tag_regex.finditer(html)]
# VULNERABLE: Complex URL validation
def validate_url_vulnerable(url):
    """Report whether ``url`` matches the demonstration URL pattern.

    WARNING: intentionally unsafe counter-example. The two alternatives
    overlap almost entirely and each repeats a ``(label\\.)+`` group, so a
    non-matching input can trigger heavy backtracking. Do not call this on
    untrusted input.

    Args:
        url: String to check.

    Returns:
        True when either alternative matches the whole string.
    """
    url_regex = re.compile(
        r'^(https?://)?(www\.)?([a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}(/.*)?$|^(ftp://)?(ftp\.)?([a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}(/.*)?$'
    )
    found = url_regex.match(url)
    return found is not None
# Test with malicious inputs:
# validate_email_vulnerable("a" * 50 + "!")  # Exponential time: the match must fail to force backtracking
# parse_user_data_vulnerable({'name': "a " * 50 + "!"})
# extract_html_tags_vulnerable("...")  # NOTE: the original example payload is missing here
# validate_url_vulnerable("http://" + "a." * 25 + "!")
import re
import time
import signal
from contextlib import contextmanager
from urllib.parse import urlparse
from typing import Dict, List, Optional, Any
import logging
# Configure the root logger to emit INFO-and-above records, then create the
# module-level logger used by the code in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class RegexTimeoutError(Exception):
    """Signals that a regex operation ran past its time limit."""
@contextmanager
def regex_timeout(seconds):
    """Abort the enclosed block with ``RegexTimeoutError`` after ``seconds``.

    The limit is enforced with a ``SIGALRM`` interval timer, which Python
    only allows from the main thread and which does not exist on every
    platform. When the timer cannot be armed (worker thread, no
    ``SIGALRM``/``setitimer``, or a non-positive/``None`` limit) the block
    simply runs without a time limit instead of raising, so callers are
    not broken by where they happen to run.

    Args:
        seconds: Time limit in seconds; may be fractional.

    Raises:
        RegexTimeoutError: Inside the ``with`` body, when the timer fires.
    """
    # Local import: the module's import block does not provide threading.
    import threading

    can_arm = (
        seconds is not None
        and seconds > 0
        and hasattr(signal, "SIGALRM")
        and hasattr(signal, "setitimer")
        and threading.current_thread() is threading.main_thread()
    )
    if not can_arm:
        yield
        return

    def timeout_handler(signum, frame):
        raise RegexTimeoutError("Regex operation timed out")

    old_handler = signal.signal(signal.SIGALRM, timeout_handler)
    if old_handler is None:
        # The previous handler was not installed from Python; it cannot be
        # passed back to signal.signal(), so restore the default instead.
        old_handler = signal.SIG_DFL
    try:
        # setitimer (unlike alarm) accepts fractional seconds. Arming inside
        # the try guarantees the handler is restored even if arming fails.
        signal.setitimer(signal.ITIMER_REAL, seconds)
        yield
    finally:
        signal.setitimer(signal.ITIMER_REAL, 0)
        signal.signal(signal.SIGALRM, old_handler)
class SafeRegexValidator:
    """Validate and parse untrusted text with bounded, linear-time checks.

    Regex validation goes through :meth:`safe_regex_match`, which rejects
    non-string or over-long input, requires the *whole* string to match and
    runs the match under ``regex_timeout``. Log and HTML helpers rely on
    plain string scanning with fixed upper bounds.
    """

    # Safe patterns without nested quantifiers
    PATTERNS = {
        'email': r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$',
        'name': r'^[A-Za-z\s]{1,100}$',  # Simple pattern with length limit
        'address': r'^[0-9A-Za-z\s,.#-]{1,200}$',  # No nested quantifiers
        'phone': r'^\+?[1-9]\d{6,14}$',
        'url': r'^https?://[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:/[^\s]*)?$',
        'ipv4': r'^(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.(?:25[0-5]|2[0-4]\d|[01]?\d\d?)$',
        'date': r'^\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])$',
    }

    # Fields checked by parse_user_data_safe; each uses the pattern of the
    # same name.
    _USER_FIELDS = ('name', 'email', 'phone', 'address')

    # Hard limits applied by the parsing helpers.
    _SLOW_MATCH_SECONDS = 0.1
    _MAX_LOG_LINES = 1000
    _MAX_LOG_MESSAGE_LENGTH = 500
    _MAX_HTML_SCAN_INDEX = 10000
    _MAX_TAG_LENGTH = 100
    _MAX_TAGS = 100

    # Fixed-width / single-class patterns used for token checks.
    _LOG_TIME_RE = re.compile(r'\d{2}:\d{2}:\d{2}')
    _LOG_LEVEL_RE = re.compile(r'[A-Z]+')
    _TAG_NAME_RE = re.compile(r'[A-Za-z][A-Za-z0-9]*')

    def __init__(self, max_input_length=1000, timeout_seconds=1):
        """Store the limits and pre-compile every entry of ``PATTERNS``.

        Args:
            max_input_length: Longest input (in characters) that is accepted.
            timeout_seconds: Time limit handed to ``regex_timeout``.
        """
        self.max_input_length = max_input_length
        self.timeout_seconds = timeout_seconds
        self.compiled_patterns = {}
        # A pattern that fails to compile is logged and left out, so later
        # lookups report it as unknown.
        for name, pattern in self.PATTERNS.items():
            try:
                self.compiled_patterns[name] = re.compile(pattern)
            except re.error as e:
                logger.error("Failed to compile pattern %s: %s", name, e)

    def validate_input_length(self, text: str) -> bool:
        """Return True only for a ``str`` no longer than ``max_input_length``.

        ``None`` and other non-string values are rejected rather than
        raising from ``len()``.
        """
        return isinstance(text, str) and len(text) <= self.max_input_length

    def safe_regex_match(self, pattern_name: str, text: str) -> bool:
        """Match ``text`` against a named pattern under a time limit.

        Args:
            pattern_name: Key of ``PATTERNS``.
            text: Candidate string.

        Returns:
            True when the entire string matches. False for non-string or
            over-long input, an unknown pattern, a timeout, or any error
            raised while matching (each case is logged).
        """
        if not isinstance(text, str):
            logger.warning(
                "Non-string input for pattern %s: %s",
                pattern_name, type(text).__name__,
            )
            return False
        if not self.validate_input_length(text):
            logger.warning(
                "Input too long for pattern %s: %s", pattern_name, len(text)
            )
            return False

        pattern = self.compiled_patterns.get(pattern_name)
        if pattern is None:
            logger.error("Unknown pattern: %s", pattern_name)
            return False

        try:
            with regex_timeout(self.timeout_seconds):
                start_time = time.perf_counter()
                # fullmatch, not match: with match() the trailing ``$``
                # would also accept a string that ends in a newline.
                result = pattern.fullmatch(text) is not None
                duration = time.perf_counter() - start_time
        except RegexTimeoutError:
            logger.error(
                "Regex timeout for pattern %s with input length %s",
                pattern_name, len(text),
            )
            return False
        except Exception as e:
            logger.error("Regex error for pattern %s: %s", pattern_name, e)
            return False

        if duration > self._SLOW_MATCH_SECONDS:  # Log slow operations
            logger.warning(
                "Slow regex operation: %s took %.3fs", pattern_name, duration
            )
        return result

    def validate_email(self, email: str) -> bool:
        """Return True when ``email`` fully matches the ``email`` pattern."""
        return self.safe_regex_match('email', email)

    def validate_url_safe(self, url: str) -> bool:
        """Validate an http(s) URL with ``urlparse`` instead of a regex.

        The host (without userinfo or port) must be at most 253 characters
        and consist of at least two non-empty dot-separated labels; a single
        trailing dot is tolerated.

        Returns:
            A real ``bool``; False for non-string, over-long or unparsable
            input.
        """
        if not self.validate_input_length(url):
            return False
        try:
            parsed = urlparse(url)
            host = parsed.hostname
        except ValueError:
            # urlparse rejects e.g. malformed bracketed IPv6 hosts.
            return False
        if parsed.scheme not in ('http', 'https') or not host:
            return False
        if len(host) > 253:
            return False
        if host.endswith('.'):
            host = host[:-1]
        labels = host.split('.')
        return len(labels) >= 2 and all(labels)

    def parse_user_data_safe(self, data: Dict[str, Any]) -> Dict[str, bool]:
        """Validate the ``name``, ``email``, ``phone`` and ``address`` fields.

        Args:
            data: Mapping of field name to value; values are converted with
                ``str()`` before matching.

        Returns:
            One boolean per field; missing or ``None`` values are False.
        """
        results = {}
        for field in self._USER_FIELDS:
            if field not in data or data[field] is None:
                results[field] = False
            else:
                results[field] = self.safe_regex_match(field, str(data[field]))
        return results

    def parse_log_entries_safe(self, log_content: str) -> List[Dict[str, str]]:
        """Parse ``<date> <time> <LEVEL> <message>`` lines without a big regex.

        Each line is split on whitespace into at most four tokens. A line is
        kept when the date matches the ``date`` pattern, the time looks like
        ``HH:MM:SS`` and the level is upper-case ASCII letters; everything
        after the level is the message (possibly empty).

        Returns:
            Dicts with ``timestamp``, ``level`` and ``message`` keys, in
            input order. At most 1000 lines are examined and messages are
            cut to 500 characters. Non-string or over-long input yields [].
        """
        if not isinstance(log_content, str):
            logger.warning("Log content must be a string")
            return []
        if not self.validate_input_length(log_content):
            logger.warning("Log content too long for processing")
            return []

        # The date pattern is fixed-width and cannot backtrack, so it is
        # applied directly instead of arming a timer for every line.
        date_re = self.compiled_patterns.get('date')
        if date_re is None:
            logger.error("Unknown pattern: %s", 'date')
            return []

        matches = []
        for line in log_content.split('\n')[:self._MAX_LOG_LINES]:
            parts = line.split(None, 3)  # date, time, level, rest of line
            if len(parts) < 3:
                continue
            date_part, time_part, level_part = parts[0], parts[1], parts[2]
            if not (
                date_re.fullmatch(date_part)
                and self._LOG_TIME_RE.fullmatch(time_part)
                and self._LOG_LEVEL_RE.fullmatch(level_part)
            ):
                continue
            # The last split token keeps trailing whitespace (e.g. '\r').
            message_part = parts[3].rstrip() if len(parts) > 3 else ''
            matches.append({
                'timestamp': date_part + ' ' + time_part,
                'level': level_part,
                'message': message_part[:self._MAX_LOG_MESSAGE_LENGTH],
            })
        return matches

    def _tag_name(self, tag_body: str) -> str:
        """Return the tag name at the start of ``tag_body``, or '' if none."""
        tokens = tag_body.split()
        if not tokens:  # e.g. "< >": nothing between the brackets
            return ''
        candidate = tokens[0]
        if candidate.endswith('/'):  # self-closing form such as "br/"
            candidate = candidate[:-1]
        if self._TAG_NAME_RE.fullmatch(candidate):
            return candidate
        return ''

    def extract_html_tags_safe(self, html: str) -> List[str]:
        """Collect tag names from ``html`` with a small state machine.

        Names must start with an ASCII letter and continue with letters or
        digits (so ``h1`` and ``br/`` are recognised); closing tags such as
        ``</div>`` are skipped.

        Returns:
            Up to 100 tag names in order of appearance. Scanning stops after
            index 10000, and a tag body longer than 100 characters is
            abandoned. Non-string or over-long input yields [].
        """
        if not self.validate_input_length(html):
            return []

        tags = []
        in_tag = False
        tag_chars = []
        for i, char in enumerate(html):
            if i > self._MAX_HTML_SCAN_INDEX or len(tags) >= self._MAX_TAGS:
                break
            if char == '<':
                in_tag = True
                tag_chars = []
            elif not in_tag:
                continue
            elif char == '>':
                in_tag = False
                tag_name = self._tag_name(''.join(tag_chars))
                if tag_name:
                    tags.append(tag_name)
            else:
                tag_chars.append(char)
                # Prevent excessively long tags
                if len(tag_chars) > self._MAX_TAG_LENGTH:
                    in_tag = False
                    tag_chars = []
        return tags
# Performance testing utilities
def test_regex_performance(validator: SafeRegexValidator):
    """Time ``validator.safe_regex_match`` on stress inputs and print a report.

    For every pattern a header line is printed, followed by one line per
    input showing its length, the elapsed time and ``PASS``/``FAIL``
    (whether the input matched). Calls slower than 0.1 s get an extra
    warning line; an exception is reported as ``ERROR`` and the run
    continues.

    Args:
        validator: Object whose ``safe_regex_match(name, text)`` is timed.
    """
    stress_inputs = {
        'email': [
            'a' * 50 + '@gmail.com',
            'test@' + 'a' * 50 + '.com',
            'a@b.c' + 'a' * 50,
        ],
        'name': [
            'a ' * 100,
            'John ' * 50 + '!',
            'A' * 200,
        ],
        'url': [
            'http://' + 'a.' * 50 + 'com',
            'https://example.com/' + 'a' * 100,
            'http://sub.' * 25 + 'example.com',
        ],
    }

    for pattern_name in stress_inputs:
        print(f"\nTesting pattern: {pattern_name}")
        for candidate in stress_inputs[pattern_name]:
            size = len(candidate)
            began = time.time()
            try:
                matched = validator.safe_regex_match(pattern_name, candidate)
                elapsed = time.time() - began
                verdict = 'PASS' if matched else 'FAIL'
                print(f" Input length {size}: {elapsed:.4f}s - {verdict}")
                if elapsed > 0.1:
                    print(" WARNING: Slow operation detected!")
            except Exception as exc:
                print(f" Input length {size}: ERROR - {exc}")
# Usage examples
def main():
    """Run the demo: sample e-mail and URL checks, then the timing report."""
    validator = SafeRegexValidator(max_input_length=1000, timeout_seconds=1)

    # E-mail samples: a valid address, a long local part (potential ReDoS
    # input) and an invalid value.
    email_samples = (
        'user@example.com',
        'a' * 50 + '@gmail.com',
        'invalid-email',
    )
    for email in email_samples:
        print(f"Email '{email[:30]}...': {validator.validate_email(email)}")

    # URL samples: a valid URL, a host with many labels (potential ReDoS
    # input) and an invalid value.
    url_samples = (
        'https://example.com',
        'http://' + 'a.' * 50 + 'com',
        'invalid-url',
    )
    for url in url_samples:
        print(f"URL '{url[:30]}...': {validator.validate_url_safe(url)}")

    # Performance testing
    test_regex_performance(validator)


if __name__ == '__main__':
    main()