Files
mysql2postgres/src/utils/validation.py
alex 23e9fc9d82 feat: Add error logging and fix incremental migration state tracking
Implement comprehensive error handling and fix state management bug in incremental migration:

Error Logging System:
- Add validation for consolidation keys (NULL dates, empty IDs, corrupted Java strings)
- Log invalid keys to dedicated error files with detailed reasons
- Full migration: migration_errors_<table>_<partition>.log
- Incremental migration: migration_errors_<table>_incremental_<timestamp>.log (timestamped to preserve history)
- Report total count of skipped invalid keys at migration completion
- Auto-delete empty error log files

State Tracking Fix:
- Fix critical bug where last_key wasn't updated after final buffer flush
- Track last_processed_key throughout migration loop
- Update state both during periodic flushes and after final flush
- Ensures incremental migration correctly resumes from last migrated key

Validation Checks:
- EventDate IS NULL or EventDate = '0000-00-00'
- EventTime IS NULL
- ToolNameID IS NULL or empty string
- UnitName IS NULL or empty string
- UnitName starting with '[L' (corrupted Java strings)

Documentation:
- Update README.md with error logging behavior
- Update MIGRATION_WORKFLOW.md with validation details
- Update CHANGELOG.md with new features and fixes

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-01 19:49:44 +01:00

171 lines
5.4 KiB
Python

"""Data validation utilities for migration."""
from typing import Dict, Any, Optional, Tuple
from datetime import datetime, date
import os
from src.utils.logger import get_logger
logger = get_logger(__name__)
class ErrorLogger:
    """Record invalid migration keys in a per-table, per-partition log file.

    Creating an instance (re)creates the log file, addressed by a relative
    path, and writes a short comment header. Each rejected key is appended
    as one ``UnitName|ToolNameID|EventDate|EventTime|Reason`` line.
    ``close()`` logs the total and deletes the file when nothing was logged.

    The file is always written as UTF-8, so key values containing non-ASCII
    characters do not depend on the platform's default text encoding.
    """

    def __init__(self, table: str, partition: str, use_timestamp: bool = False):
        """Initialize error logger and create the error file with its header.

        Args:
            table: Table name
            partition: Partition name (e.g., 'p2024' or 'incremental')
            use_timestamp: If True, add timestamp to filename (for incremental migrations)
        """
        self.table = table
        self.partition = partition
        self.error_count = 0

        # One clock reading, so the filename stamp and the header time agree.
        created_at = datetime.now()

        # Add timestamp to filename for incremental migrations to avoid overwriting
        if use_timestamp or partition == "incremental":
            timestamp = created_at.strftime("%Y%m%d_%H%M%S")
            self.error_file = f"migration_errors_{table}_{partition}_{timestamp}.log"
        else:
            self.error_file = f"migration_errors_{table}_{partition}.log"

        # Mode "w" truncates an existing file of the same name.
        with open(self.error_file, "w", encoding="utf-8") as f:
            f.write(f"# Migration errors for {table} partition {partition}\n")
            f.write(f"# Timestamp: {created_at.strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write("# Format: UnitName|ToolNameID|EventDate|EventTime|Reason\n\n")
        logger.info(f"Error log file created: {self.error_file}")

    def log_invalid_key(
        self,
        unit_name: Any,
        tool_name_id: Any,
        event_date: Any,
        event_time: Any,
        reason: str
    ) -> None:
        """Log an invalid consolidation key.

        Appends one pipe-separated line to the error file and increments the
        error counter. A progress warning is emitted every 100 logged keys.

        Args:
            unit_name: UnitName value
            tool_name_id: ToolNameID value
            event_date: EventDate value
            event_time: EventTime value
            reason: Reason for rejection
        """
        # The file is reopened per call, so each record is written and the
        # handle closed before this method returns.
        with open(self.error_file, "a", encoding="utf-8") as f:
            f.write(f"{unit_name}|{tool_name_id}|{event_date}|{event_time}|{reason}\n")
        self.error_count += 1
        if self.error_count % 100 == 0:
            logger.warning(f"Logged {self.error_count} invalid keys to {self.error_file}")

    def get_error_count(self) -> int:
        """Get total number of errors logged.

        Returns:
            Number of errors logged
        """
        return self.error_count

    def close(self) -> None:
        """Close error logger and log summary.

        When no invalid key was logged, the header-only error file is removed.
        """
        if self.error_count > 0:
            logger.warning(
                f"Total invalid keys for {self.table} partition {self.partition}: "
                f"{self.error_count} (see {self.error_file})"
            )
            return

        logger.info(f"No invalid keys found for {self.table} partition {self.partition}")
        # Remove empty error file. Attempt the removal directly rather than
        # checking existence first: a file that is already gone is fine.
        try:
            os.remove(self.error_file)
        except FileNotFoundError:
            pass
def validate_consolidation_key(
    unit_name: Any,
    tool_name_id: Any,
    event_date: Any,
    event_time: Any
) -> Tuple[bool, Optional[str]]:
    """Validate a consolidation key.

    Checks run in a fixed order and the first failure is reported:
    UnitName (NULL/empty, then corrupted Java string), ToolNameID
    (NULL/empty), EventDate (NULL, zero date, unparseable string or
    unsupported type), and finally EventTime (NULL).

    Args:
        unit_name: UnitName value
        tool_name_id: ToolNameID value
        event_date: EventDate value; a ``YYYY-MM-DD`` string, a ``date`` or a
            ``datetime`` is accepted
        event_time: EventTime value

    Returns:
        Tuple of (is_valid, error_reason)
        If valid: (True, None)
        If invalid: (False, "reason description")
    """
    # Check for NULL unit_name
    if unit_name is None or unit_name == "":
        return False, "UnitName is NULL or empty"

    # Check for corrupted Java strings (like '[Ljava.lang.String;@...')
    if isinstance(unit_name, str) and unit_name.startswith("[L"):
        return False, f"UnitName is corrupted Java string: {unit_name}"

    if tool_name_id is None or tool_name_id == "":
        return False, "ToolNameID is NULL or empty"

    # Check for NULL or invalid dates
    if event_date is None:
        return False, "EventDate is NULL"

    if isinstance(event_date, str):
        # Zero date, with or without a trailing time component
        if event_date.startswith("0000-00-00"):
            return False, f"EventDate is invalid: {event_date}"
        # Parse only to validate; the parsed value itself is not needed.
        try:
            datetime.strptime(event_date, "%Y-%m-%d")
        except ValueError as e:
            return False, f"EventDate parsing failed: {event_date} ({e})"
    elif isinstance(event_date, date):
        # datetime is a subclass of date, so this branch covers both and
        # no conversion is needed just to read the year.
        if event_date.year == 0:
            return False, f"EventDate year is 0: {event_date}"
    else:
        return False, f"EventDate has invalid type: {type(event_date)}"

    # Check for NULL event_time
    if event_time is None:
        return False, "EventTime is NULL"

    return True, None
def validate_mysql_row(row: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
    """Check that a MySQL row can be migrated.

    The consolidation key columns (UnitName, ToolNameID, EventDate,
    EventTime) are validated first; a row whose key passes must also carry
    a non-NULL NodeNum. Columns missing from the dictionary are treated the
    same as NULL.

    Args:
        row: MySQL row dictionary

    Returns:
        Tuple of (is_valid, error_reason); error_reason is None for a valid row
    """
    key_columns = ("UnitName", "ToolNameID", "EventDate", "EventTime")
    key_values = [row.get(column) for column in key_columns]

    # The consolidation key decides first; its rejection reason is passed through
    key_ok, key_error = validate_consolidation_key(*key_values)
    if not key_ok:
        return False, key_error

    # Check for NodeNum
    node_num = row.get("NodeNum")
    if node_num is not None:
        return True, None
    return False, "NodeNum is NULL"