mysql2postgres/src/migrator/incremental_migration.py
alex 0f217379ea fix: Use actual PostgreSQL row count for total_rows_migrated tracking
Replace session-level counting with direct table COUNT queries so that
total_rows_migrated always reflects the actual row count in PostgreSQL. This fixes
the discrepancy where the counter tracked only rows from the current session
and missed earlier insertions as well as duplicates from failed resume attempts.

Key improvements:
- Use get_row_count() after each batch to get the authoritative total (see the sketch below)
- Preserve the previous count on resume and accumulate across sessions
- Remove the dependency on error-prone session-level counters
- Ensure migration_state.total_rows_migrated matches the actual table row count
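
For illustration, the batch-level update might look like this (a sketch, not
the exact diff; it assumes PostgreSQLConnector exposes the same get_row_count()
helper the MySQL connector has):

    inserted = pg_conn.insert_batch(pg_table, transformed, columns)
    # Ask PostgreSQL for the authoritative total instead of trusting
    # session-level counters
    actual_total = pg_conn.get_row_count(pg_table)
    self.state.state[pg_table]["total_rows_migrated"] = actual_total
    self.state._save_state()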

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-23 15:33:27 +01:00

327 lines
11 KiB
Python

"""Incremental migration from MySQL to PostgreSQL based on timestamps."""
from datetime import datetime
from typing import Optional
import psycopg
from config import get_settings, TABLE_CONFIGS
from src.connectors.mysql_connector import MySQLConnector
from src.connectors.postgres_connector import PostgreSQLConnector
from src.transformers.data_transformer import DataTransformer
from src.utils.logger import get_logger, setup_logger
from src.utils.progress import ProgressTracker
from src.migrator.state import MigrationState
logger = get_logger(__name__)

class IncrementalMigrator:
    """Perform incremental migration based on timestamps."""

    def __init__(self, table: str, state_file: str = "migration_state.json"):
        """Initialize incremental migrator.

        Args:
            table: Table name to migrate
            state_file: Path to migration state file
        """
        if table not in TABLE_CONFIGS:
            raise ValueError(f"Unknown table: {table}")
        self.table = table
        self.config = TABLE_CONFIGS[table]
        self.settings = get_settings()
        self.state = MigrationState(state_file)
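
    # Each TABLE_CONFIGS entry is expected to carry at least "mysql_table" and
    # "postgres_table", plus an optional "primary_key" (defaulting to "id");
    # those are the only keys this module reads.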

    def _get_last_timestamp_from_db(
        self,
        pg_conn: PostgreSQLConnector,
        pg_table: str
    ) -> Optional[str]:
        """Get last migration timestamp from the PostgreSQL migration_state table.

        Args:
            pg_conn: PostgreSQL connector
            pg_table: PostgreSQL table name

        Returns:
            ISO format timestamp or None if not found
        """
        try:
            with pg_conn.connection.cursor() as cursor:
                cursor.execute(
                    "SELECT last_migrated_timestamp FROM migration_state WHERE table_name = %s",
                    (pg_table,)
                )
                result = cursor.fetchone()
                if result and result[0]:
                    return result[0].isoformat()
        except psycopg.Error:
            return None
        return None
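
    # Note: the only schema assumed for migration_state is the pair of columns
    # queried above, (table_name, last_migrated_timestamp); anything else the
    # table may contain is irrelevant to this fallback lookup.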

    def migrate(self, dry_run: bool = False, use_id: bool = False) -> int:
        """Perform incremental migration since last sync.

        Args:
            dry_run: If True, log what would be done but don't modify data
            use_id: If True, use ID-based resumption, else use timestamp-based

        Returns:
            Number of rows migrated
        """
        setup_logger(__name__)
        mysql_table = self.config["mysql_table"]
        pg_table = self.config["postgres_table"]
        primary_key = self.config.get("primary_key", "id")

        logger.info(
            f"Starting incremental migration of {mysql_table} -> {pg_table} "
            f"({'ID-based' if use_id else 'timestamp-based'})"
        )

        try:
            with MySQLConnector() as mysql_conn:
                with PostgreSQLConnector() as pg_conn:
                    if use_id:
                        return self._migrate_by_id(
                            mysql_conn, pg_conn, mysql_table, pg_table, primary_key, dry_run
                        )
                    else:
                        return self._migrate_by_timestamp(
                            mysql_conn, pg_conn, mysql_table, pg_table, dry_run
                        )
        except Exception as e:
            logger.error(f"Incremental migration failed: {e}")
            raise
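
    # Both connectors are used as context managers, so connections get cleaned
    # up even when a batch fails mid-run (assuming their __exit__ closes them);
    # the bare re-raise preserves the original traceback for the caller after
    # logging.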

    def _migrate_by_timestamp(
        self,
        mysql_conn: MySQLConnector,
        pg_conn: PostgreSQLConnector,
        mysql_table: str,
        pg_table: str,
        dry_run: bool
    ) -> int:
        """Migrate rows using timestamp-based resumption.

        Args:
            mysql_conn: MySQL connector
            pg_conn: PostgreSQL connector
            mysql_table: MySQL table name
            pg_table: PostgreSQL table name
            dry_run: If True, don't modify data

        Returns:
            Number of rows migrated
        """
        # Try to get the last migration timestamp from the state file first
        last_timestamp = self.state.get_last_timestamp(pg_table)

        # If not in the state file, fall back to the PostgreSQL migration_state table
        if last_timestamp is None:
            try:
                last_timestamp = self._get_last_timestamp_from_db(pg_conn, pg_table)
                if last_timestamp:
                    logger.info(f"Found previous migration state in database: {last_timestamp}")
            except Exception as e:
                logger.debug(f"Could not read from migration_state table: {e}")
                last_timestamp = None

        if last_timestamp is None:
            logger.info(
                f"No previous migration found for {pg_table}. "
                "Use 'migrate --full' for initial migration."
            )
            return 0

        logger.info(f"Last migration timestamp: {last_timestamp}")

        # Choose the timestamp column used to detect new rows
        timestamp_col = "updated_at" if mysql_table == "ELABDATADISP" else "created_at"

        # Get max timestamp from PostgreSQL
        pg_max_timestamp = pg_conn.get_max_timestamp(
            pg_table,
            timestamp_col
        )
        logger.info(f"Current max timestamp in PostgreSQL: {pg_max_timestamp}")

        if dry_run:
            logger.info("[DRY RUN] Would migrate rows after timestamp")
            return 0

        migrated = 0
        migration_start_time = datetime.utcnow().isoformat()

        # Fetch and migrate rows in batches
        batch_count = 0
        progress = None
        for batch in mysql_conn.fetch_rows_since(
            mysql_table,
            last_timestamp
        ):
            batch_count += 1
            if batch_count == 1:
                # Total row count is unknown up front; seed the tracker with
                # the first batch size
                progress = ProgressTracker(
                    len(batch),
                    f"Migrating {mysql_table} (incremental)"
                )
                progress.__enter__()

            # Transform batch
            transformed = DataTransformer.transform_batch(
                mysql_table,
                batch
            )

            # Insert batch
            columns = DataTransformer.get_column_order(pg_table)
            inserted = pg_conn.insert_batch(
                pg_table,
                transformed,
                columns
            )
            migrated += inserted
            progress.update(inserted)

        if batch_count == 0:
            logger.info(f"No new rows to migrate for {mysql_table}")
            return 0

        progress.__exit__(None, None, None)

        # Update migration state
        self.state.set_last_timestamp(pg_table, migration_start_time)
        self.state.increment_migration_count(pg_table, migrated)

        logger.info(
            f"✓ Incremental migration complete: {migrated} rows migrated "
            f"to {pg_table}"
        )
        return migrated
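
    # Timestamp watermark semantics: the state records the time this run
    # *started*, not the newest migrated row's timestamp. Rows committed to
    # MySQL while the run was in flight are therefore re-fetched on the next
    # run; this is safe as long as insert_batch() is idempotent (e.g. upsert /
    # ON CONFLICT DO NOTHING), which this module assumes but does not enforce.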

    def _migrate_by_id(
        self,
        mysql_conn: MySQLConnector,
        pg_conn: PostgreSQLConnector,
        mysql_table: str,
        pg_table: str,
        primary_key: str,
        dry_run: bool
    ) -> int:
        """Migrate rows using ID-based resumption (resumable from last ID).

        Args:
            mysql_conn: MySQL connector
            pg_conn: PostgreSQL connector
            mysql_table: MySQL table name
            pg_table: PostgreSQL table name
            primary_key: Primary key column name
            dry_run: If True, don't modify data

        Returns:
            Number of rows migrated
        """
        # Get last migrated ID from state
        total_count = mysql_conn.get_row_count(mysql_table)
        state_dict = self.state.state.get(pg_table, {})
        last_id = state_dict.get("last_id")
        previously_migrated = state_dict.get("total_migrated", 0)

        if last_id is None:
            logger.info(
                f"No previous ID-based migration found for {pg_table}. "
                "Starting from beginning."
            )
            remaining = total_count
        else:
            # Estimate only: assumes IDs are roughly contiguous from 1; used
            # solely to size the progress bar
            remaining = total_count - last_id
            logger.info(
                f"Resuming ID-based migration from ID > {last_id}\n"
                f"Previously migrated: {previously_migrated} rows\n"
                f"Remaining to migrate: {remaining} rows"
            )

        if dry_run:
            logger.info(f"[DRY RUN] Would migrate {remaining} rows")
            return remaining

        migrated = 0
        with ProgressTracker(
            remaining,
            f"Migrating {mysql_table} (resumable)"
        ) as progress:
            # Fetch and migrate rows in batches
            for batch in mysql_conn.fetch_rows_from_id(
                mysql_table,
                primary_key,
                last_id
            ):
                if not batch:
                    break

                # Transform batch
                transformed = DataTransformer.transform_batch(
                    mysql_table,
                    batch
                )

                # Insert batch
                columns = DataTransformer.get_column_order(pg_table)
                inserted = pg_conn.insert_batch(
                    pg_table,
                    transformed,
                    columns
                )

                if inserted > 0:
                    # Get the max ID from the batch
                    batch_max_id = max(
                        int(row.get(primary_key, 0)) for row in batch
                    )
                    migrated += inserted
                    progress.update(inserted)

                    # Update state after each batch so a crash can resume here
                    if pg_table not in self.state.state:
                        self.state.state[pg_table] = {}
                    self.state.state[pg_table]["last_id"] = batch_max_id
                    self.state.state[pg_table]["total_migrated"] = previously_migrated + migrated
                    self.state.state[pg_table]["last_updated"] = datetime.utcnow().isoformat()
                    self.state._save_state()

        logger.info(
            f"✓ ID-based incremental migration complete: {migrated} rows migrated "
            f"to {pg_table}"
        )
        return migrated
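
    # Design note: _migrate_by_id checkpoints through MigrationState's private
    # _save_state() after every batch; the trade-off is one state-file write
    # per batch in exchange for at most one batch of rework after a crash.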


def run_incremental_migration(
    table: str,
    dry_run: bool = False,
    state_file: str = "migration_state.json",
    use_id: bool = False
) -> int:
    """Run incremental migration for a table.

    Args:
        table: Table name to migrate
        dry_run: If True, show what would be done without modifying data
        state_file: Path to migration state file
        use_id: If True, use ID-based resumption, else use timestamp-based

    Returns:
        Number of rows migrated
    """
    migrator = IncrementalMigrator(table, state_file)
    return migrator.migrate(dry_run=dry_run, use_id=use_id)
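

# Example usage (the table name below is hypothetical; valid names are the
# keys of TABLE_CONFIGS):
#
#     from src.migrator.incremental_migration import run_incremental_migration
#
#     # Preview a resumable, ID-based run without touching data
#     run_incremental_migration("measurements", dry_run=True, use_id=True)
#
#     # Perform the migration, resuming from the last checkpointed ID
#     migrated = run_incremental_migration("measurements", use_id=True)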