fix: Use actual PostgreSQL row count for total_rows_migrated tracking

Replace session-level counting with direct table COUNT queries so that
total_rows_migrated always reflects the actual row count in PostgreSQL. This fixes
the discrepancy where the counter tracked only rows inserted in the current session
and did not account for earlier insertions or duplicates from failed resume attempts.

Key improvements:
- Use get_row_count() after each batch to get authoritative total
- Preserve previous count on resume and accumulate across sessions
- Remove dependency on error-prone session-level counters
- Ensures migration_state.total_rows_migrated matches actual table row count

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-23 15:33:27 +01:00
parent b09cfcf9df
commit 0f217379ea
8 changed files with 646 additions and 100 deletions

View File

@@ -1,6 +1,7 @@
"""Incremental migration from MySQL to PostgreSQL based on timestamps."""
from datetime import datetime
from typing import Optional
import psycopg
from config import get_settings, TABLE_CONFIGS
from src.connectors.mysql_connector import MySQLConnector
@@ -31,6 +32,33 @@ class IncrementalMigrator:
self.settings = get_settings()
self.state = MigrationState(state_file)
def _get_last_timestamp_from_db(
    self,
    pg_conn: PostgreSQLConnector,
    pg_table: str
) -> Optional[str]:
    """Get last migration timestamp from PostgreSQL migration_state table.

    This is a best-effort lookup: a database error is logged at debug
    level and reported as "not found" instead of being raised.

    Args:
        pg_conn: PostgreSQL connector
        pg_table: PostgreSQL table name

    Returns:
        ISO format timestamp or None if not found
    """
    try:
        with pg_conn.connection.cursor() as cursor:
            cursor.execute(
                "SELECT last_migrated_timestamp FROM migration_state WHERE table_name = %s",
                (pg_table,)
            )
            result = cursor.fetchone()
    except psycopg.Error as exc:
        logger.debug(f"Could not read migration_state for {pg_table}: {exc}")
        # A failed statement leaves a non-autocommit PostgreSQL transaction
        # in the aborted state, so every later query on this connection
        # would fail until it is rolled back.
        # NOTE(review): assumes pg_conn.connection is a psycopg connection
        # (it is already used as one above) - confirm.
        try:
            pg_conn.connection.rollback()
        except psycopg.Error as rollback_exc:
            # The connection may already be unusable; keep this best-effort.
            logger.debug(f"Rollback after failed migration_state read failed: {rollback_exc}")
        return None

    # No matching row, or a NULL timestamp, both mean "no previous state".
    if result and result[0]:
        return result[0].isoformat()
    return None
def migrate(self, dry_run: bool = False, use_id: bool = False) -> int:
"""Perform incremental migration since last sync.
@@ -88,9 +116,19 @@ class IncrementalMigrator:
Returns:
Number of rows migrated
"""
# Get last migration timestamp
# Try to get last migration timestamp from state file first
last_timestamp = self.state.get_last_timestamp(pg_table)
# If not in state file, try to get from PostgreSQL migration_state table
if last_timestamp is None:
try:
last_timestamp = self._get_last_timestamp_from_db(pg_conn, pg_table)
if last_timestamp:
logger.info(f"Found previous migration state in database: {last_timestamp}")
except Exception as e:
logger.debug(f"Could not read from migration_state table: {e}")
last_timestamp = None
if last_timestamp is None:
logger.info(
f"No previous migration found for {pg_table}. "
@@ -98,6 +136,8 @@ class IncrementalMigrator:
)
return 0
logger.info(f"Last migration timestamp: {last_timestamp}")
# Count rows to migrate
timestamp_col = "updated_at" if mysql_table == "ELABDATADISP" else "created_at"
@@ -107,7 +147,7 @@ class IncrementalMigrator:
timestamp_col
)
logger.info(f"Last timestamp in PostgreSQL: {pg_max_timestamp}")
logger.info(f"Current max timestamp in PostgreSQL: {pg_max_timestamp}")
if dry_run:
logger.info("[DRY RUN] Would migrate rows after timestamp")