feat: Add MySQL to PostgreSQL migration tool with JSONB transformation

Implement comprehensive migration solution with:
- Full and incremental migration modes
- JSONB schema transformation for RAWDATACOR and ELABDATADISP tables
- Native PostgreSQL partitioning (2014-2031)
- Optimized GIN indexes for JSONB queries
- Rich logging with progress tracking
- Complete benchmark system for MySQL vs PostgreSQL comparison
- CLI interface with multiple commands (setup, migrate, benchmark)
- Configuration management via .env file
- Error handling and retry logic
- Batch processing for performance (configurable batch size)

Database transformations:
- RAWDATACOR: 16 Val columns + units → single JSONB measurements
- ELABDATADISP: 25+ measurement fields → structured JSONB with categories

🤖 Generated with Claude Code

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-10 19:57:11 +01:00
commit 62577d3200
24 changed files with 2075 additions and 0 deletions

0
src/migrator/__init__.py Normal file
View File

View File

@@ -0,0 +1,149 @@
"""Full migration from MySQL to PostgreSQL."""
from typing import Optional
from datetime import datetime
import json
from config import get_settings, TABLE_CONFIGS
from src.connectors.mysql_connector import MySQLConnector
from src.connectors.postgres_connector import PostgreSQLConnector
from src.transformers.data_transformer import DataTransformer
from src.utils.logger import get_logger, setup_logger
from src.utils.progress import ProgressTracker
logger = get_logger(__name__)
class FullMigrator:
    """Perform full migration of a table from MySQL to PostgreSQL.

    The table name is looked up in ``TABLE_CONFIGS``; the matching entry
    supplies the source (``mysql_table``) and destination
    (``postgres_table``) names used by :meth:`migrate`.
    """

    def __init__(self, table: str):
        """Initialize migrator for a table.

        Args:
            table: Table name to migrate ('RAWDATACOR' or 'ELABDATADISP').
                Must be a key of ``TABLE_CONFIGS``.

        Raises:
            ValueError: If ``table`` is not a key of ``TABLE_CONFIGS``.
        """
        if table not in TABLE_CONFIGS:
            raise ValueError(f"Unknown table: {table}")
        self.table = table
        self.config = TABLE_CONFIGS[table]
        self.settings = get_settings()

    def migrate(self, dry_run: bool = False) -> int:
        """Perform full migration of the table.

        Rows are read from MySQL in batches, passed through
        ``DataTransformer.transform_batch`` and written with
        ``insert_batch``. After the last batch the ``migration_state``
        table is updated on a best-effort basis.

        Args:
            dry_run: If True, log what would be done but don't modify data.
                The PostgreSQL connection is never opened in this mode.

        Returns:
            Total number of rows migrated. In dry-run mode this is the
            MySQL row count that would have been migrated.

        Raises:
            ValueError: If the destination table does not exist in
                PostgreSQL.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        setup_logger(__name__)
        mysql_table = self.config["mysql_table"]
        pg_table = self.config["postgres_table"]
        logger.info(f"Starting full migration of {mysql_table} -> {pg_table}")

        try:
            with MySQLConnector() as mysql_conn:
                # The row count is used for the dry-run result and as the
                # progress tracker's total.
                total_rows = mysql_conn.get_row_count(mysql_table)
                logger.info(f"Total rows to migrate: {total_rows}")

                if dry_run:
                    logger.info("[DRY RUN] Would migrate all rows")
                    return total_rows

                with PostgreSQLConnector() as pg_conn:
                    # Fail early if the schema has not been created yet.
                    if not pg_conn.table_exists(pg_table):
                        raise ValueError(
                            f"PostgreSQL table {pg_table} does not exist. "
                            "Run 'setup --create-schema' first."
                        )

                    # The column order depends only on the destination
                    # table, so look it up once instead of once per batch.
                    # NOTE(review): assumes get_column_order has no side
                    # effects — confirm against DataTransformer.
                    columns = DataTransformer.get_column_order(pg_table)

                    migrated = 0
                    with ProgressTracker(
                        total_rows,
                        f"Migrating {mysql_table}"
                    ) as progress:
                        for batch in mysql_conn.fetch_all_rows(mysql_table):
                            transformed = DataTransformer.transform_batch(
                                mysql_table,
                                batch
                            )
                            # The value returned by insert_batch is treated
                            # as the number of rows written.
                            inserted = pg_conn.insert_batch(
                                pg_table,
                                transformed,
                                columns
                            )
                            migrated += inserted
                            progress.update(inserted)

                    logger.info(
                        f"✓ Migration complete: {migrated} rows migrated "
                        f"to {pg_table}"
                    )

                    # Record the outcome; failures here are only logged.
                    self._update_migration_state(pg_conn, migrated)
                    return migrated
        except Exception as e:
            logger.error(f"Migration failed: {e}")
            raise

    def _update_migration_state(
        self,
        pg_conn: PostgreSQLConnector,
        rows_migrated: int
    ) -> None:
        """Update migration state tracking table.

        Upserts one row per destination table into ``migration_state``.
        This is best-effort: any exception is logged as a warning and
        swallowed so that a completed data migration is not reported as
        failed.

        Args:
            pg_conn: PostgreSQL connection
            rows_migrated: Number of rows migrated
        """
        try:
            pg_table = self.config["postgres_table"]
            # Plain string: the statement has no Python-side interpolation,
            # all values are bound through the %s parameters below.
            query = """
                INSERT INTO migration_state
                (table_name, last_migrated_timestamp, total_rows_migrated, migration_completed_at, status)
                VALUES (%s, %s, %s, %s, %s)
                ON CONFLICT (table_name) DO UPDATE SET
                last_migrated_timestamp = EXCLUDED.last_migrated_timestamp,
                total_rows_migrated = EXCLUDED.total_rows_migrated,
                migration_completed_at = EXCLUDED.migration_completed_at,
                status = EXCLUDED.status
            """
            # The same instant is stored as both the last-migrated and the
            # completed-at timestamp.
            now = datetime.utcnow()
            pg_conn.execute(query, (pg_table, now, rows_migrated, now, "completed"))
            logger.debug("Migration state updated")
        except Exception as e:
            logger.warning(f"Failed to update migration state: {e}")
def run_full_migration(
    table: str,
    dry_run: bool = False
) -> int:
    """Build a ``FullMigrator`` for ``table`` and run it once.

    Args:
        table: Table name to migrate
        dry_run: If True, show what would be done without modifying data

    Returns:
        Whatever ``FullMigrator.migrate`` returns (the migrated row count)
    """
    return FullMigrator(table).migrate(dry_run=dry_run)

View File

@@ -0,0 +1,155 @@
"""Incremental migration from MySQL to PostgreSQL based on timestamps."""
from datetime import datetime
from typing import Optional
from config import get_settings, TABLE_CONFIGS
from src.connectors.mysql_connector import MySQLConnector
from src.connectors.postgres_connector import PostgreSQLConnector
from src.transformers.data_transformer import DataTransformer
from src.utils.logger import get_logger, setup_logger
from src.utils.progress import ProgressTracker
from src.migrator.state import MigrationState
logger = get_logger(__name__)
class IncrementalMigrator:
    """Perform incremental migration based on timestamps.

    The timestamp of the previous run is read from a ``MigrationState``
    file, keyed by the PostgreSQL table name; rows fetched from MySQL
    "since" that timestamp are transformed and inserted into PostgreSQL.
    """

    def __init__(self, table: str, state_file: str = "migration_state.json"):
        """Initialize incremental migrator.

        Args:
            table: Table name to migrate. Must be a key of ``TABLE_CONFIGS``.
            state_file: Path to migration state file

        Raises:
            ValueError: If ``table`` is not a key of ``TABLE_CONFIGS``.
        """
        if table not in TABLE_CONFIGS:
            raise ValueError(f"Unknown table: {table}")
        self.table = table
        self.config = TABLE_CONFIGS[table]
        self.settings = get_settings()
        self.state = MigrationState(state_file)

    def migrate(self, dry_run: bool = False) -> int:
        """Perform incremental migration since last sync.

        Returns 0 without touching either database when the state file
        holds no previous timestamp for the destination table. Otherwise
        rows are fetched in batches, transformed and inserted; on success
        the state file is updated with the time this run started and the
        number of rows inserted.

        Args:
            dry_run: If True, log what would be done but don't modify data

        Returns:
            Number of rows migrated (0 for a dry run, for a missing
            previous timestamp, or when there are no new rows)

        Raises:
            Exception: Any failure is logged and re-raised unchanged.
        """
        # Function-scope import so the class does not depend on a change to
        # the module's import header.
        from contextlib import ExitStack

        setup_logger(__name__)
        mysql_table = self.config["mysql_table"]
        pg_table = self.config["postgres_table"]

        # State is keyed by the PostgreSQL table name.
        last_timestamp = self.state.get_last_timestamp(pg_table)
        if last_timestamp is None:
            logger.info(
                f"No previous migration found for {pg_table}. "
                "Use 'migrate --full' for initial migration."
            )
            return 0

        logger.info(
            f"Starting incremental migration of {mysql_table} -> {pg_table} "
            f"since {last_timestamp}"
        )

        try:
            with MySQLConnector() as mysql_conn:
                timestamp_col = "updated_at" if mysql_table == "ELABDATADISP" else "created_at"

                with PostgreSQLConnector() as pg_conn:
                    # Informational only: the fetch below is driven by
                    # last_timestamp from the state file, not by this value.
                    pg_max_timestamp = pg_conn.get_max_timestamp(
                        pg_table,
                        timestamp_col
                    )
                    logger.info(f"Last timestamp in PostgreSQL: {pg_max_timestamp}")

                    if dry_run:
                        logger.info("[DRY RUN] Would migrate rows after timestamp")
                        return 0

                    migrated = 0
                    # Captured before fetching so the next run starts from
                    # the moment this one began.
                    migration_start_time = datetime.utcnow().isoformat()

                    # The column order depends only on the destination
                    # table, so look it up once instead of once per batch.
                    # NOTE(review): assumes get_column_order has no side
                    # effects — confirm against DataTransformer.
                    columns = DataTransformer.get_column_order(pg_table)

                    batch_count = 0
                    # ExitStack guarantees the lazily created tracker is
                    # exited even when a batch raises; the previous manual
                    # __enter__/__exit__ pair skipped __exit__ on errors.
                    with ExitStack() as stack:
                        progress = None
                        for batch in mysql_conn.fetch_rows_since(
                            mysql_table,
                            last_timestamp
                        ):
                            batch_count += 1
                            if progress is None:
                                # The overall total is unknown up front, so
                                # the tracker is sized from the first batch.
                                progress = ProgressTracker(
                                    len(batch),
                                    f"Migrating {mysql_table} (incremental)"
                                )
                                stack.enter_context(progress)

                            transformed = DataTransformer.transform_batch(
                                mysql_table,
                                batch
                            )
                            inserted = pg_conn.insert_batch(
                                pg_table,
                                transformed,
                                columns
                            )
                            migrated += inserted
                            progress.update(inserted)

                    if batch_count == 0:
                        # Nothing fetched: leave the stored timestamp as is.
                        logger.info(f"No new rows to migrate for {mysql_table}")
                        return 0

                    # Persist the new starting point and the running total.
                    self.state.set_last_timestamp(pg_table, migration_start_time)
                    self.state.increment_migration_count(pg_table, migrated)

                    logger.info(
                        f"✓ Incremental migration complete: {migrated} rows migrated "
                        f"to {pg_table}"
                    )
                    return migrated
        except Exception as e:
            logger.error(f"Incremental migration failed: {e}")
            raise
def run_incremental_migration(
    table: str,
    dry_run: bool = False,
    state_file: str = "migration_state.json"
) -> int:
    """Build an ``IncrementalMigrator`` for ``table`` and run it once.

    Args:
        table: Table name to migrate
        dry_run: If True, show what would be done without modifying data
        state_file: Path to migration state file

    Returns:
        Whatever ``IncrementalMigrator.migrate`` returns (the migrated
        row count)
    """
    return IncrementalMigrator(table, state_file=state_file).migrate(
        dry_run=dry_run
    )

105
src/migrator/state.py Normal file
View File

@@ -0,0 +1,105 @@
"""Migration state management."""
import json
from pathlib import Path
from datetime import datetime
from typing import Optional, Dict, Any
from src.utils.logger import get_logger
logger = get_logger(__name__)
class MigrationState:
    """Manage migration state for incremental migrations.

    The state is a JSON object stored at ``state_file``. Each top-level
    key is a table name whose value is a dict that may hold
    ``last_timestamp``, ``last_updated`` and ``total_migrated``. Every
    mutating method writes the whole state back to disk immediately.
    """

    DEFAULT_STATE_FILE = "migration_state.json"

    def __init__(self, state_file: str = DEFAULT_STATE_FILE):
        """Initialize migration state.

        Args:
            state_file: Path to state file. A missing or unreadable file
                results in an empty state.
        """
        self.state_file = Path(state_file)
        self.state = self._load_state()

    def _load_state(self) -> Dict[str, Any]:
        """Load state from file.

        Returns:
            The parsed JSON object, or an empty dict when the file does
            not exist, cannot be read or parsed, or does not contain a
            JSON object at the top level.
        """
        if not self.state_file.exists():
            return {}
        try:
            with open(self.state_file, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            logger.warning(f"Failed to load state file: {e}")
            return {}
        if not isinstance(loaded, dict):
            # Every accessor calls .get() on the state, so a list or scalar
            # root would only fail later with an AttributeError.
            logger.warning(
                "Failed to load state file: expected a JSON object, "
                f"got {type(loaded).__name__}"
            )
            return {}
        return loaded

    def _save_state(self) -> None:
        """Save state to file.

        The state is written to a sibling ``<name>.tmp`` file that then
        replaces the real file, so an interrupted write cannot leave a
        truncated state file behind.

        Raises:
            Exception: Any write failure is logged and re-raised.
        """
        tmp_path = self.state_file.with_name(self.state_file.name + ".tmp")
        try:
            with open(tmp_path, "w", encoding="utf-8") as f:
                json.dump(self.state, f, indent=2)
            tmp_path.replace(self.state_file)
        except Exception as e:
            logger.error(f"Failed to save state file: {e}")
            # Best-effort cleanup of the partial temp file.
            try:
                tmp_path.unlink()
            except OSError:
                pass
            raise

    def get_last_timestamp(self, table: str) -> Optional[str]:
        """Get last migration timestamp for a table.

        Args:
            table: Table name

        Returns:
            ISO format timestamp or None if not found
        """
        return self.state.get(table, {}).get("last_timestamp")

    def set_last_timestamp(self, table: str, timestamp: str) -> None:
        """Set last migration timestamp for a table and persist it.

        Also stamps ``last_updated`` with the current UTC time.

        Args:
            table: Table name
            timestamp: ISO format timestamp
        """
        entry = self.state.setdefault(table, {})
        entry["last_timestamp"] = timestamp
        entry["last_updated"] = datetime.utcnow().isoformat()
        self._save_state()

    def get_migration_count(self, table: str) -> int:
        """Get total migration count for a table.

        Args:
            table: Table name

        Returns:
            Total rows migrated (0 when nothing is recorded)
        """
        return self.state.get(table, {}).get("total_migrated", 0)

    def increment_migration_count(self, table: str, count: int) -> None:
        """Increment migration count for a table and persist it.

        Args:
            table: Table name
            count: Number of rows to add
        """
        entry = self.state.setdefault(table, {})
        entry["total_migrated"] = entry.get("total_migrated", 0) + count
        self._save_state()

    def reset(self, table: Optional[str] = None) -> None:
        """Reset migration state and persist the result.

        Args:
            table: Table name to reset, or None to reset all. A reset
                table keeps its key with an empty entry.
        """
        if table:
            self.state[table] = {}
        else:
            self.state = {}
        self._save_state()