"""Data validation utilities for migration.""" from typing import Dict, Any, Optional, Tuple from datetime import datetime, date import os from src.utils.logger import get_logger logger = get_logger(__name__) class ErrorLogger: """Log invalid migration keys to a file.""" def __init__(self, table: str, partition: str, use_timestamp: bool = False): """Initialize error logger. Args: table: Table name partition: Partition name (e.g., 'p2024' or 'incremental') use_timestamp: If True, add timestamp to filename (for incremental migrations) """ self.table = table self.partition = partition # Add timestamp to filename for incremental migrations to avoid overwriting if use_timestamp or partition == "incremental": timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") self.error_file = f"migration_errors_{table}_{partition}_{timestamp}.log" else: self.error_file = f"migration_errors_{table}_{partition}.log" self.error_count = 0 # Create error file with header with open(self.error_file, "w") as f: f.write(f"# Migration errors for {table} partition {partition}\n") f.write(f"# Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n") f.write("# Format: UnitName|ToolNameID|EventDate|EventTime|Reason\n\n") logger.info(f"Error log file created: {self.error_file}") def log_invalid_key( self, unit_name: Any, tool_name_id: Any, event_date: Any, event_time: Any, reason: str ) -> None: """Log an invalid consolidation key. Args: unit_name: UnitName value tool_name_id: ToolNameID value event_date: EventDate value event_time: EventTime value reason: Reason for rejection """ with open(self.error_file, "a") as f: f.write(f"{unit_name}|{tool_name_id}|{event_date}|{event_time}|{reason}\n") self.error_count += 1 if self.error_count % 100 == 0: logger.warning(f"Logged {self.error_count} invalid keys to {self.error_file}") def get_error_count(self) -> int: """Get total number of errors logged. Returns: Number of errors logged """ return self.error_count def close(self) -> None: """Close error logger and log summary.""" if self.error_count > 0: logger.warning( f"Total invalid keys for {self.table} partition {self.partition}: " f"{self.error_count} (see {self.error_file})" ) else: logger.info(f"No invalid keys found for {self.table} partition {self.partition}") # Remove empty error file if os.path.exists(self.error_file): os.remove(self.error_file) def validate_consolidation_key( unit_name: Any, tool_name_id: Any, event_date: Any, event_time: Any ) -> Tuple[bool, Optional[str]]: """Validate a consolidation key. Args: unit_name: UnitName value tool_name_id: ToolNameID value event_date: EventDate value event_time: EventTime value Returns: Tuple of (is_valid, error_reason) If valid: (True, None) If invalid: (False, "reason description") """ # Check for NULL unit_name or tool_name_id if unit_name is None or unit_name == "": return False, "UnitName is NULL or empty" # Check for corrupted Java strings (like '[Ljava.lang.String;@...') if isinstance(unit_name, str) and unit_name.startswith("[L"): return False, f"UnitName is corrupted Java string: {unit_name}" if tool_name_id is None or tool_name_id == "": return False, "ToolNameID is NULL or empty" # Check for NULL or invalid dates if event_date is None: return False, "EventDate is NULL" # Check for invalid date like '0000-00-00' try: if isinstance(event_date, str): if event_date.startswith("0000-00-00"): return False, f"EventDate is invalid: {event_date}" # Try to parse parsed_date = datetime.strptime(event_date, "%Y-%m-%d").date() elif isinstance(event_date, (date, datetime)): parsed_date = event_date if isinstance(event_date, date) else event_date.date() # Check for zero date if parsed_date.year == 0: return False, f"EventDate year is 0: {event_date}" else: return False, f"EventDate has invalid type: {type(event_date)}" except (ValueError, AttributeError) as e: return False, f"EventDate parsing failed: {event_date} ({e})" # Check for NULL event_time if event_time is None: return False, "EventTime is NULL" return True, None def validate_mysql_row(row: Dict[str, Any]) -> Tuple[bool, Optional[str]]: """Validate a complete MySQL row for migration. Args: row: MySQL row dictionary Returns: Tuple of (is_valid, error_reason) """ # Validate consolidation key is_valid, reason = validate_consolidation_key( row.get("UnitName"), row.get("ToolNameID"), row.get("EventDate"), row.get("EventTime") ) if not is_valid: return False, reason # Check for NodeNum if row.get("NodeNum") is None: return False, "NodeNum is NULL" return True, None