feat: Add MySQL to PostgreSQL migration tool with JSONB transformation
Implement comprehensive migration solution with:
- Full and incremental migration modes
- JSONB schema transformation for RAWDATACOR and ELABDATADISP tables
- Native PostgreSQL partitioning (2014-2031)
- Optimized GIN indexes for JSONB queries
- Rich logging with progress tracking
- Complete benchmark system for MySQL vs PostgreSQL comparison
- CLI interface with multiple commands (setup, migrate, benchmark)
- Configuration management via .env file
- Error handling and retry logic
- Batch processing for performance (configurable batch size)

Database transformations:
- RAWDATACOR: 16 Val columns + units → single JSONB measurements
- ELABDATADISP: 25+ measurement fields → structured JSONB with categories

🤖 Generated with Claude Code

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
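For illustration, a minimal sketch of the RAWDATACOR transformation described above. The Val1..Val16 and Unit1..Unit16 column names are assumptions for this sketch, not the actual schema; the real mapping lives in the migration tool.

# Hypothetical sketch: collapse 16 value/unit column pairs of a RAWDATACOR row
# into a single dict ready to be stored as a JSONB "measurements" column.
# Column names Val1..Val16 / Unit1..Unit16 are illustrative assumptions.
def row_to_measurements(row: dict) -> dict:
    measurements = {}
    for i in range(1, 17):
        value = row.get(f"Val{i}")
        if value is None:
            continue
        measurements[f"val{i}"] = {"value": value, "unit": row.get(f"Unit{i}")}
    return measurements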
src/connectors/mysql_connector.py (new file, 166 lines)
@@ -0,0 +1,166 @@
"""MySQL database connector."""
import pymysql
from typing import List, Dict, Any, Optional, Generator
from config import get_settings
from src.utils.logger import get_logger

logger = get_logger(__name__)


class MySQLConnector:
    """Connector for MySQL database."""

    def __init__(self):
        """Initialize MySQL connector with settings."""
        self.settings = get_settings()
        self.connection = None

    def connect(self) -> None:
        """Establish connection to MySQL database."""
        try:
            self.connection = pymysql.connect(
                host=self.settings.mysql.host,
                port=self.settings.mysql.port,
                user=self.settings.mysql.user,
                password=self.settings.mysql.password,
                database=self.settings.mysql.database,
                charset="utf8mb4",
                cursorclass=pymysql.cursors.DictCursor,
            )
            logger.info(
                f"Connected to MySQL: {self.settings.mysql.host}:"
                f"{self.settings.mysql.port}/{self.settings.mysql.database}"
            )
        except pymysql.Error as e:
            logger.error(f"Failed to connect to MySQL: {e}")
            raise

    def disconnect(self) -> None:
        """Close connection to MySQL database."""
        if self.connection:
            self.connection.close()
            logger.info("Disconnected from MySQL")

    def __enter__(self):
        """Context manager entry."""
        self.connect()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit."""
        self.disconnect()

    def get_row_count(self, table: str) -> int:
        """Get total row count for a table.

        Args:
            table: Table name

        Returns:
            Number of rows in the table
        """
        try:
            with self.connection.cursor() as cursor:
                cursor.execute(f"SELECT COUNT(*) as count FROM `{table}`")
                result = cursor.fetchone()
                return result["count"]
        except pymysql.Error as e:
            logger.error(f"Failed to get row count for {table}: {e}")
            raise

    def fetch_all_rows(
        self,
        table: str,
        batch_size: Optional[int] = None
    ) -> Generator[List[Dict[str, Any]], None, None]:
        """Fetch all rows from a table in batches.

        Args:
            table: Table name
            batch_size: Number of rows per batch (uses config default if None)

        Yields:
            Batches of row dictionaries
        """
        if batch_size is None:
            batch_size = self.settings.migration.batch_size

        offset = 0
        while True:
            try:
                with self.connection.cursor() as cursor:
                    query = f"SELECT * FROM `{table}` LIMIT %s OFFSET %s"
                    cursor.execute(query, (batch_size, offset))
                    rows = cursor.fetchall()

                    if not rows:
                        break

                    yield rows
                    offset += len(rows)

            except pymysql.Error as e:
                logger.error(f"Failed to fetch rows from {table}: {e}")
                raise

    def fetch_rows_since(
        self,
        table: str,
        since_timestamp: str,
        batch_size: Optional[int] = None
    ) -> Generator[List[Dict[str, Any]], None, None]:
        """Fetch rows modified since a timestamp.

        Args:
            table: Table name
            since_timestamp: ISO format timestamp (e.g., '2024-01-01T00:00:00')
            batch_size: Number of rows per batch (uses config default if None)

        Yields:
            Batches of row dictionaries
        """
        if batch_size is None:
            batch_size = self.settings.migration.batch_size

        offset = 0
        timestamp_col = "updated_at" if table == "ELABDATADISP" else "created_at"

        while True:
            try:
                with self.connection.cursor() as cursor:
                    query = (
                        f"SELECT * FROM `{table}` "
                        f"WHERE `{timestamp_col}` > %s "
                        f"ORDER BY `{timestamp_col}` ASC "
                        f"LIMIT %s OFFSET %s"
                    )
                    cursor.execute(query, (since_timestamp, batch_size, offset))
                    rows = cursor.fetchall()

                    if not rows:
                        break

                    yield rows
                    offset += len(rows)

            except pymysql.Error as e:
                logger.error(f"Failed to fetch rows from {table}: {e}")
                raise

    def get_table_structure(self, table: str) -> Dict[str, Any]:
        """Get table structure (column info).

        Args:
            table: Table name

        Returns:
            Dictionary with column information
        """
        try:
            with self.connection.cursor() as cursor:
                cursor.execute(f"DESCRIBE `{table}`")
                columns = cursor.fetchall()
                return {col["Field"]: col for col in columns}
        except pymysql.Error as e:
            logger.error(f"Failed to get structure for {table}: {e}")
            raise
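A brief usage sketch of the connector added in this commit. The RAWDATACOR table name comes from the commit message and the example timestamp from the fetch_rows_since docstring; how the batches are transformed and written to PostgreSQL is out of scope here.

# Illustrative usage only; the caller would normally be the migration CLI.
from src.connectors.mysql_connector import MySQLConnector

with MySQLConnector() as mysql:
    total = mysql.get_row_count("RAWDATACOR")
    fetched = 0

    # Full migration: stream every row in configurable batches.
    for batch in mysql.fetch_all_rows("RAWDATACOR"):
        fetched += len(batch)
        print(f"fetched {fetched}/{total} rows")

    # Incremental migration: only rows changed since the given ISO timestamp.
    for batch in mysql.fetch_rows_since("RAWDATACOR", "2024-01-01T00:00:00"):
        pass  # transform to JSONB and insert into PostgreSQL here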