# nlds/security/pii_protection.py
import re
import hashlib
from typing import Dict, List, Tuple, Any
class PIIProtectionManager:
    """Detect and mask personally identifiable information (PII) in text.

    Each supported PII type has a regular expression in ``pii_patterns`` and
    a masking function in ``masking_strategies``; both dicts are keyed by the
    same type name. Patterns are always applied case-insensitively.
    """

    def __init__(self):
        """Register the built-in PII patterns and their masking functions."""
        # Insertion order matters: it is the tie-breaker when two equally
        # long matches overlap in mask_pii, and it fixes the order in which
        # detections are reported.
        self.pii_patterns = {
            # TLD class is [A-Za-z]; a '|' inside a character class is a
            # literal pipe, not alternation.
            'email': r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',
            'phone': r'\b(?:\+?1[-.\s]?)?\(?([0-9]{3})\)?[-.\s]?([0-9]{3})[-.\s]?([0-9]{4})\b',
            'ssn': r'\b\d{3}-?\d{2}-?\d{4}\b',
            'credit_card': r'\b(?:\d{4}[-\s]?){3}\d{4}\b',
            'ip_address': r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b',
            # Deliberately broad: any run of 32+ alphanumerics.
            'api_key': r'\b[A-Za-z0-9]{32,}\b',
            'jwt_token': r'\beyJ[A-Za-z0-9_-]*\.[A-Za-z0-9_-]*\.[A-Za-z0-9_-]*\b'
        }
        self.masking_strategies = {
            'email': self._mask_email,
            'phone': self._mask_phone,
            'ssn': self._mask_ssn,
            'credit_card': self._mask_credit_card,
            'ip_address': self._mask_ip,
            'api_key': self._mask_api_key,
            'jwt_token': self._mask_jwt_token
        }

    def detect_pii(self, text: str) -> List[Dict[str, Any]]:
        """Find every pattern match in ``text``.

        Matches of different PII types may overlap; all of them are reported.

        Args:
            text: The text to scan.

        Returns:
            One dict per match with keys ``type``, ``value`` (the matched
            substring), ``start``, ``end`` (offsets into ``text``) and
            ``confidence``. Entries are grouped by PII type in pattern order
            and appear left to right within each type.
        """
        detections = []
        for pii_type, pattern in self.pii_patterns.items():
            for match in re.finditer(pattern, text, re.IGNORECASE):
                value = match.group()
                detections.append({
                    'type': pii_type,
                    'value': value,
                    'start': match.start(),
                    'end': match.end(),
                    'confidence': self._calculate_confidence(pii_type, value)
                })
        return detections

    def mask_pii(self, text: str, mask_char: str = '*') -> Tuple[str, List[Dict[str, Any]]]:
        """Return ``text`` with detected PII masked, plus a record per masked span.

        All patterns are matched against the original text, so one pattern
        can never re-match (or be blocked by) another pattern's masked output.
        When matches overlap, the longest one wins; equal lengths are decided
        by pattern order. For example, a JWT is masked as a whole token even
        though its header alone also matches the ``api_key`` pattern.

        Args:
            text: The text to sanitise.
            mask_char: The string substituted for hidden characters.

        Returns:
            A ``(masked_text, detections)`` tuple. Each detection has keys
            ``type``, ``original_length``, ``masked_length``, ``position``
            (start offset in the original ``text``) and ``hash`` (first 16 hex
            digits of the SHA-256 of the original value). Detections are
            grouped by PII type in pattern order and listed right to left
            within each type.

        NOTE(review): the hash is unsalted and truncated; for low-entropy
        values such as SSNs or phone numbers it can be brute-forced. Consider
        a keyed HMAC if these records leave trusted storage.
        """
        # 1. Collect every candidate span from the untouched input.
        candidates = []
        for type_rank, (pii_type, pattern) in enumerate(self.pii_patterns.items()):
            for match in re.finditer(pattern, text, re.IGNORECASE):
                if match.end() > match.start():  # ignore empty matches
                    candidates.append((type_rank, pii_type, match.start(), match.end()))

        # 2. Resolve overlaps: longest match first, then pattern order.
        candidates.sort(key=lambda c: (c[2] - c[3], c[0], c[2]))
        claimed = bytearray(len(text))  # 1 where a character is already taken
        accepted = []
        for candidate in candidates:
            start, end = candidate[2], candidate[3]
            if any(claimed[start:end]):
                continue
            claimed[start:end] = b'\x01' * (end - start)
            accepted.append(candidate)

        # 3. Splice the masked values in a single left-to-right pass.
        pieces = []
        records = []
        cursor = 0
        for type_rank, pii_type, start, end in sorted(accepted, key=lambda c: c[2]):
            original_value = text[start:end]
            strategy = self.masking_strategies.get(pii_type)
            if strategy is not None:
                masked_value = strategy(original_value, mask_char)
            else:
                # A pattern registered without a strategy is fully masked
                # rather than raising KeyError.
                masked_value = mask_char * len(original_value)
            pieces.append(text[cursor:start])
            pieces.append(masked_value)
            cursor = end
            records.append((type_rank, start, {
                'type': pii_type,
                'original_length': len(original_value),
                'masked_length': len(masked_value),
                'position': start,
                'hash': hashlib.sha256(original_value.encode()).hexdigest()[:16]
            }))
        pieces.append(text[cursor:])

        # Keep the historical ordering: by type, then right to left.
        records.sort(key=lambda r: (r[0], -r[1]))
        return ''.join(pieces), [record[2] for record in records]

    def _mask_email(self, email: str, mask_char: str) -> str:
        """Mask an email address, keeping the user's first/last character and the TLD.

        Every domain label except the last is masked in place, so the number
        of labels is preserved. Usernames of one or two characters are fully
        masked. Input without exactly one ``@`` is fully masked.
        """
        parts = email.split('@')
        if len(parts) != 2:
            return mask_char * len(email)
        username, domain = parts
        if len(username) > 2:
            masked_username = username[0] + mask_char * (len(username) - 2) + username[-1]
        else:
            masked_username = mask_char * len(username)
        labels = domain.split('.')
        if len(labels) >= 2:
            masked_labels = [mask_char * len(label) for label in labels[:-1]]
            masked_domain = '.'.join(masked_labels + [labels[-1]])
        else:
            masked_domain = mask_char * len(domain)
        return f"{masked_username}@{masked_domain}"

    def _mask_phone(self, phone: str, mask_char: str) -> str:
        """Mask a phone number, keeping its first and last three characters.

        Separators in the middle are masked too. Input with fewer than ten
        digits is fully masked.
        """
        digits_only = re.sub(r'\D', '', phone)
        if len(digits_only) >= 10:
            return phone[:3] + mask_char * (len(phone) - 6) + phone[-3:]
        return mask_char * len(phone)

    def _mask_ssn(self, ssn: str, mask_char: str) -> str:
        """Mask an SSN, keeping only its last four characters.

        Input of four characters or fewer is fully masked; otherwise the
        "last four" slice would return the whole value unmasked.
        """
        if len(ssn) <= 4:
            return mask_char * len(ssn)
        return mask_char * (len(ssn) - 4) + ssn[-4:]

    def _mask_credit_card(self, cc: str, mask_char: str) -> str:
        """Mask a card number, keeping separators and the last four digits.

        Each hidden digit is replaced by ``mask_char``; non-digit characters
        are copied through unchanged. Input with fewer than twelve digits is
        fully masked.
        """
        digit_total = sum(1 for char in cc if char.isdigit())
        if digit_total < 12:
            return mask_char * len(cc)
        first_visible = digit_total - 4
        out = []
        digits_seen = 0
        for char in cc:
            if char.isdigit():
                out.append(char if digits_seen >= first_visible else mask_char)
                digits_seen += 1
            else:
                out.append(char)
        return ''.join(out)

    def _mask_ip(self, ip: str, mask_char: str) -> str:
        """Mask the last two octets of a dotted-quad IP address.

        Input that does not split into four dot-separated parts is fully
        masked.
        """
        parts = ip.split('.')
        if len(parts) == 4:
            return f"{parts[0]}.{parts[1]}.{mask_char * len(parts[2])}.{mask_char * len(parts[3])}"
        return mask_char * len(ip)

    def _mask_api_key(self, api_key: str, mask_char: str) -> str:
        """Mask an API key, keeping its first and last four characters.

        Keys of eight characters or fewer are fully masked.
        """
        if len(api_key) > 8:
            return api_key[:4] + mask_char * (len(api_key) - 8) + api_key[-4:]
        return mask_char * len(api_key)

    def _mask_jwt_token(self, token: str, mask_char: str) -> str:
        """Mask a JWT, keeping at most the first eight characters of its header.

        The payload and signature are each replaced by sixteen mask
        characters regardless of their real length. Input that does not have
        exactly three dot-separated parts becomes a mask of at most 32
        characters.
        """
        parts = token.split('.')
        if len(parts) == 3:
            return f"{parts[0][:8]}.{mask_char * 16}.{mask_char * 16}"
        return mask_char * min(len(token), 32)

    def _calculate_confidence(self, pii_type: str, value: str) -> float:
        """Return a heuristic confidence score for a detection.

        Only the rule for ``pii_type`` is evaluated. Unknown types score 0.5.
        """
        if pii_type == 'email':
            return 0.95 if '@' in value and '.' in value else 0.7
        if pii_type == 'ip_address':
            return 0.85
        if pii_type == 'api_key':
            return 0.8 if len(value) >= 32 else 0.6
        if pii_type == 'jwt_token':
            return 0.95 if value.count('.') == 2 else 0.7
        if pii_type in ('phone', 'ssn', 'credit_card'):
            digit_count = len(re.sub(r'\D', '', value))
            if pii_type == 'phone':
                return 0.9 if digit_count == 10 else 0.7
            if pii_type == 'ssn':
                return 0.95 if digit_count == 9 else 0.8
            return 0.9 if digit_count >= 13 else 0.7
        return 0.5