feat: Add MySQL to PostgreSQL migration tool with JSONB transformation

Implement comprehensive migration solution with:
- Full and incremental migration modes
- JSONB schema transformation for RAWDATACOR and ELABDATADISP tables
- Native PostgreSQL partitioning (2014-2031)
- Optimized GIN indexes for JSONB queries
- Rich logging with progress tracking
- Complete benchmark system for MySQL vs PostgreSQL comparison
- CLI interface with multiple commands (setup, migrate, benchmark)
- Configuration management via .env file
- Error handling and retry logic
- Batch processing for performance (configurable batch size)

Database transformations:
- RAWDATACOR: 16 Val columns + units → single JSONB measurements
- ELABDATADISP: 25+ measurement fields → structured JSONB with categories

🤖 Generated with Claude Code

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
2025-12-10 19:57:11 +01:00
commit 62577d3200
24 changed files with 2075 additions and 0 deletions

View File

View File

@@ -0,0 +1,178 @@
"""Data transformation from MySQL to PostgreSQL format."""
from typing import Dict, Any, List
from datetime import datetime
from config import (
RAWDATACOR_COLUMNS,
ELABDATADISP_FIELD_MAPPING,
TABLE_CONFIGS,
)
from src.utils.logger import get_logger
logger = get_logger(__name__)
class DataTransformer:
    """Transform MySQL data to PostgreSQL format."""

    @staticmethod
    def transform_rawdatacor_row(mysql_row: Dict[str, Any]) -> Dict[str, Any]:
        """Transform a RAWDATACOR row from MySQL to PostgreSQL format.

        Args:
            mysql_row: Row dictionary from MySQL

        Returns:
            Transformed row dictionary for PostgreSQL
        """
        # Collapse the paired Val/Unit columns into one JSONB-ready dict,
        # keyed by the column's ordinal position as a string.
        measurements: Dict[str, Any] = {}
        unit_columns = RAWDATACOR_COLUMNS["unit_columns"]
        for idx, value_column in enumerate(RAWDATACOR_COLUMNS["val_columns"]):
            value = mysql_row.get(value_column)
            if value is None:
                # NULL measurements are omitted from the JSONB entirely.
                continue
            unit = mysql_row.get(unit_columns[idx])
            measurements[str(idx)] = {
                "value": str(value),
                "unit": unit if unit else None,
            }

        # Map the MySQL CamelCase columns onto the snake_case PG schema.
        return {
            "id": mysql_row["id"],
            "unit_name": mysql_row.get("UnitName"),
            "tool_name_id": mysql_row["ToolNameID"],
            "node_num": mysql_row["NodeNum"],
            "event_date": mysql_row["EventDate"],
            "event_time": mysql_row["EventTime"],
            "bat_level": mysql_row["BatLevel"],
            "temperature": mysql_row["Temperature"],
            "measurements": measurements,
            "created_at": mysql_row.get("created_at"),
            "bat_level_module": mysql_row.get("BatLevelModule"),
            "temperature_module": mysql_row.get("TemperatureModule"),
            "rssi_module": mysql_row.get("RssiModule"),
        }

    @staticmethod
    def transform_elabdatadisp_row(mysql_row: Dict[str, Any]) -> Dict[str, Any]:
        """Transform an ELABDATADISP row from MySQL to PostgreSQL format.

        Args:
            mysql_row: Row dictionary from MySQL

        Returns:
            Transformed row dictionary for PostgreSQL
        """
        # Bucket each mapped measurement field into its category; buckets
        # that stay empty are dropped before the row is assembled.
        grouped: Dict[str, Dict[str, Any]] = {
            "shifts": {},
            "coordinates": {},
            "kinematics": {},
            "sensors": {},
            "calculated": {},
        }
        for mysql_col, (category, pg_key) in ELABDATADISP_FIELD_MAPPING.items():
            value = mysql_row.get(mysql_col)
            if value is None:
                continue
            # Values that arrive as strings are coerced to float.
            grouped[category][pg_key] = float(value) if isinstance(value, str) else value

        measurements = {name: bucket for name, bucket in grouped.items() if bucket}

        return {
            "id_elab_data": mysql_row["idElabData"],
            "unit_name": mysql_row.get("UnitName"),
            "tool_name_id": mysql_row["ToolNameID"],
            "node_num": mysql_row["NodeNum"],
            "event_date": mysql_row["EventDate"],
            "event_time": mysql_row["EventTime"],
            "state": mysql_row.get("State"),
            "calc_err": mysql_row.get("calcerr", 0),
            "measurements": measurements,
            "created_at": mysql_row.get("created_at"),
            "updated_at": mysql_row.get("updated_at"),
        }

    @staticmethod
    def transform_batch(
        table: str,
        rows: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Transform a batch of rows from MySQL to PostgreSQL format.

        Args:
            table: Table name ('RAWDATACOR' or 'ELABDATADISP')
            rows: List of row dictionaries from MySQL

        Returns:
            List of transformed row dictionaries for PostgreSQL

        Raises:
            ValueError: If ``table`` is not one of the supported tables.
        """
        row_transformers = {
            "RAWDATACOR": DataTransformer.transform_rawdatacor_row,
            "ELABDATADISP": DataTransformer.transform_elabdatadisp_row,
        }
        if table not in row_transformers:
            raise ValueError(f"Unknown table: {table}")
        transform = row_transformers[table]
        return [transform(row) for row in rows]

    @staticmethod
    def get_column_order(table: str) -> List[str]:
        """Get the column order for inserting into PostgreSQL.

        Args:
            table: PostgreSQL table name

        Returns:
            List of column names in order

        Raises:
            ValueError: If ``table`` is not one of the supported tables.
        """
        column_orders = {
            "rawdatacor": [
                "id",
                "unit_name",
                "tool_name_id",
                "node_num",
                "event_date",
                "event_time",
                "bat_level",
                "temperature",
                "measurements",
                "created_at",
                "bat_level_module",
                "temperature_module",
                "rssi_module",
            ],
            "elabdatadisp": [
                "id_elab_data",
                "unit_name",
                "tool_name_id",
                "node_num",
                "event_date",
                "event_time",
                "state",
                "calc_err",
                "measurements",
                "created_at",
                "updated_at",
            ],
        }
        if table not in column_orders:
            raise ValueError(f"Unknown table: {table}")
        return column_orders[table]

View File

@@ -0,0 +1,149 @@
"""PostgreSQL schema creation from MySQL structure."""
from config import PARTITION_YEARS
from src.utils.logger import get_logger
logger = get_logger(__name__)
def create_rawdatacor_schema(years=None) -> str:
    """Create PostgreSQL schema for RAWDATACOR table.

    Args:
        years: Iterable of partition years to create. Defaults to
            ``config.PARTITION_YEARS`` when None.

    Returns:
        SQL script to create the table with its yearly partitions
        and indexes.

    Note:
        The table is partitioned by ``RANGE (event_date)`` rather than by
        the expression ``EXTRACT(YEAR FROM event_date)``: PostgreSQL
        requires every PRIMARY KEY column to be a plain partition key
        column, so declaring ``PRIMARY KEY (id, event_date)`` on a table
        partitioned by an expression fails at CREATE TABLE time.
    """
    if years is None:
        years = PARTITION_YEARS

    sql = """
-- Create RAWDATACOR table with partitioning
CREATE TABLE IF NOT EXISTS rawdatacor (
    id BIGSERIAL NOT NULL,
    unit_name VARCHAR(32),
    tool_name_id VARCHAR(32) NOT NULL,
    node_num INTEGER NOT NULL,
    event_date DATE NOT NULL,
    event_time TIME NOT NULL,
    bat_level NUMERIC(4,2) NOT NULL,
    temperature NUMERIC(5,2) NOT NULL,
    measurements JSONB,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    bat_level_module NUMERIC(4,2),
    temperature_module NUMERIC(5,2),
    rssi_module INTEGER,
    PRIMARY KEY (id, event_date)
) PARTITION BY RANGE (event_date);

-- Create partitions for each year
"""
    # One partition per calendar year: [Jan 1 of year, Jan 1 of year+1).
    for year in years:
        next_year = year + 1
        sql += f"""
CREATE TABLE IF NOT EXISTS rawdatacor_{year}
    PARTITION OF rawdatacor
    FOR VALUES FROM ('{year}-01-01') TO ('{next_year}-01-01');
"""

    # Indexes created on the partitioned parent are cascaded to all
    # partitions automatically (PostgreSQL 11+).
    sql += """
-- Create indexes
CREATE INDEX IF NOT EXISTS idx_unit_tool_node_datetime_raw
    ON rawdatacor(unit_name, tool_name_id, node_num, event_date, event_time);

CREATE INDEX IF NOT EXISTS idx_unit_tool_raw
    ON rawdatacor(unit_name, tool_name_id);

CREATE INDEX IF NOT EXISTS idx_measurements_gin_raw
    ON rawdatacor USING GIN (measurements);

CREATE INDEX IF NOT EXISTS idx_event_date_raw
    ON rawdatacor(event_date);
"""
    return sql
def create_elabdatadisp_schema(years=None) -> str:
    """Create PostgreSQL schema for ELABDATADISP table.

    Args:
        years: Iterable of partition years to create. Defaults to
            ``config.PARTITION_YEARS`` when None.

    Returns:
        SQL script to create the table with its yearly partitions
        and indexes.

    Note:
        The table is partitioned by ``RANGE (event_date)`` rather than by
        the expression ``EXTRACT(YEAR FROM event_date)``: PostgreSQL
        requires every PRIMARY KEY column to be a plain partition key
        column, so declaring ``PRIMARY KEY (id_elab_data, event_date)`` on
        a table partitioned by an expression fails at CREATE TABLE time.
    """
    if years is None:
        years = PARTITION_YEARS

    sql = """
-- Create ELABDATADISP table with partitioning
CREATE TABLE IF NOT EXISTS elabdatadisp (
    id_elab_data BIGSERIAL NOT NULL,
    unit_name VARCHAR(32),
    tool_name_id VARCHAR(32) NOT NULL,
    node_num INTEGER NOT NULL,
    event_date DATE NOT NULL,
    event_time TIME NOT NULL,
    state VARCHAR(32),
    calc_err INTEGER DEFAULT 0,
    measurements JSONB,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (id_elab_data, event_date)
) PARTITION BY RANGE (event_date);

-- Create partitions for each year
"""
    # One partition per calendar year: [Jan 1 of year, Jan 1 of year+1).
    for year in years:
        next_year = year + 1
        sql += f"""
CREATE TABLE IF NOT EXISTS elabdatadisp_{year}
    PARTITION OF elabdatadisp
    FOR VALUES FROM ('{year}-01-01') TO ('{next_year}-01-01');
"""

    # Indexes created on the partitioned parent are cascaded to all
    # partitions automatically (PostgreSQL 11+).
    sql += """
-- Create indexes
CREATE INDEX IF NOT EXISTS idx_unit_tool_node_datetime_elab
    ON elabdatadisp(unit_name, tool_name_id, node_num, event_date, event_time);

CREATE INDEX IF NOT EXISTS idx_unit_tool_elab
    ON elabdatadisp(unit_name, tool_name_id);

CREATE INDEX IF NOT EXISTS idx_measurements_gin_elab
    ON elabdatadisp USING GIN (measurements);

CREATE INDEX IF NOT EXISTS idx_event_date_elab
    ON elabdatadisp(event_date);
"""
    return sql
def create_migration_state_table() -> str:
    """Create table to track migration state.

    Returns:
        SQL to create migration_state table
    """
    # One row per migrated table; checkpoint columns (last id/timestamp)
    # allow an interrupted migration to resume where it stopped.
    return """
-- Create table to track migration state
CREATE TABLE IF NOT EXISTS migration_state (
    table_name VARCHAR(255) PRIMARY KEY,
    last_migrated_timestamp TIMESTAMP,
    last_migrated_id BIGINT,
    migration_started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    migration_completed_at TIMESTAMP,
    total_rows_migrated BIGINT DEFAULT 0,
    status VARCHAR(32) DEFAULT 'pending'
);
"""
def get_full_schema_script() -> str:
    """Get complete schema creation script for PostgreSQL.

    Returns:
        Full SQL script to create all tables and indexes
    """
    # Concatenate the individual DDL sections, separated by blank lines.
    sections = [
        create_rawdatacor_schema(),
        create_elabdatadisp_schema(),
        create_migration_state_table(),
    ]
    return "\n\n".join(sections)