feat: Add MySQL to PostgreSQL migration tool with JSONB transformation
Implement comprehensive migration solution with:
- Full and incremental migration modes
- JSONB schema transformation for RAWDATACOR and ELABDATADISP tables
- Native PostgreSQL partitioning (2014-2031)
- Optimized GIN indexes for JSONB queries
- Rich logging with progress tracking
- Complete benchmark system for MySQL vs PostgreSQL comparison
- CLI interface with multiple commands (setup, migrate, benchmark)
- Configuration management via .env file
- Error handling and retry logic
- Batch processing for performance (configurable batch size)

Database transformations:
- RAWDATACOR: 16 Val columns + units → single JSONB measurements
- ELABDATADISP: 25+ measurement fields → structured JSONB with categories

🤖 Generated with Claude Code

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
This commit is contained in:
0
src/transformers/__init__.py
Normal file
0
src/transformers/__init__.py
Normal file
178
src/transformers/data_transformer.py
Normal file
178
src/transformers/data_transformer.py
Normal file
@@ -0,0 +1,178 @@
|
||||
"""Data transformation from MySQL to PostgreSQL format."""
|
||||
from typing import Dict, Any, List
|
||||
from datetime import datetime
|
||||
from config import (
|
||||
RAWDATACOR_COLUMNS,
|
||||
ELABDATADISP_FIELD_MAPPING,
|
||||
TABLE_CONFIGS,
|
||||
)
|
||||
from src.utils.logger import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class DataTransformer:
    """Transform MySQL rows into their PostgreSQL/JSONB representation.

    All methods are static; the class is a stateless namespace grouping the
    per-table row transformations plus batch and column-order helpers.
    """

    @staticmethod
    def transform_rawdatacor_row(mysql_row: Dict[str, Any]) -> Dict[str, Any]:
        """Transform a RAWDATACOR row from MySQL to PostgreSQL format.

        The Val*/unit* column pairs are folded into a single JSONB-ready
        ``measurements`` dict keyed by the channel index as a string.

        Args:
            mysql_row: Row dictionary from MySQL

        Returns:
            Transformed row dictionary for PostgreSQL
        """
        measurements: Dict[str, Any] = {}
        paired_columns = zip(
            RAWDATACOR_COLUMNS["val_columns"],
            RAWDATACOR_COLUMNS["unit_columns"],
        )
        for channel, (val_col, unit_col) in enumerate(paired_columns):
            raw_value = mysql_row.get(val_col)
            if raw_value is None:
                # Channels without a value are omitted from the JSONB entirely.
                continue
            raw_unit = mysql_row.get(unit_col)
            measurements[str(channel)] = {
                "value": str(raw_value),
                # Falsy units (e.g. empty string) are normalised to None.
                "unit": raw_unit or None,
            }

        return {
            "id": mysql_row["id"],
            "unit_name": mysql_row.get("UnitName"),
            "tool_name_id": mysql_row["ToolNameID"],
            "node_num": mysql_row["NodeNum"],
            "event_date": mysql_row["EventDate"],
            "event_time": mysql_row["EventTime"],
            "bat_level": mysql_row["BatLevel"],
            "temperature": mysql_row["Temperature"],
            "measurements": measurements,
            "created_at": mysql_row.get("created_at"),
            "bat_level_module": mysql_row.get("BatLevelModule"),
            "temperature_module": mysql_row.get("TemperatureModule"),
            "rssi_module": mysql_row.get("RssiModule"),
        }

    @staticmethod
    def transform_elabdatadisp_row(mysql_row: Dict[str, Any]) -> Dict[str, Any]:
        """Transform an ELABDATADISP row from MySQL to PostgreSQL format.

        Measurement fields are grouped into category sub-dicts per
        ELABDATADISP_FIELD_MAPPING; categories that end up empty are dropped.

        Args:
            mysql_row: Row dictionary from MySQL

        Returns:
            Transformed row dictionary for PostgreSQL
        """
        categories = ("shifts", "coordinates", "kinematics", "sensors", "calculated")
        grouped: Dict[str, Dict[str, Any]] = {name: {} for name in categories}

        for mysql_col, (category, pg_key) in ELABDATADISP_FIELD_MAPPING.items():
            raw = mysql_row.get(mysql_col)
            if raw is None:
                continue
            # Strings are coerced to float; other types pass through unchanged.
            grouped[category][pg_key] = float(raw) if isinstance(raw, str) else raw

        # Keep only the categories that actually received measurements.
        measurements = {name: fields for name, fields in grouped.items() if fields}

        return {
            "id_elab_data": mysql_row["idElabData"],
            "unit_name": mysql_row.get("UnitName"),
            "tool_name_id": mysql_row["ToolNameID"],
            "node_num": mysql_row["NodeNum"],
            "event_date": mysql_row["EventDate"],
            "event_time": mysql_row["EventTime"],
            "state": mysql_row.get("State"),
            "calc_err": mysql_row.get("calcerr", 0),
            "measurements": measurements,
            "created_at": mysql_row.get("created_at"),
            "updated_at": mysql_row.get("updated_at"),
        }

    @staticmethod
    def transform_batch(
        table: str,
        rows: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Transform a batch of rows from MySQL to PostgreSQL format.

        Args:
            table: Table name ('RAWDATACOR' or 'ELABDATADISP')
            rows: List of row dictionaries from MySQL

        Returns:
            List of transformed row dictionaries for PostgreSQL

        Raises:
            ValueError: If ``table`` is not a known table name.
        """
        row_transformers = {
            "RAWDATACOR": DataTransformer.transform_rawdatacor_row,
            "ELABDATADISP": DataTransformer.transform_elabdatadisp_row,
        }
        if table not in row_transformers:
            raise ValueError(f"Unknown table: {table}")
        transform = row_transformers[table]
        return [transform(row) for row in rows]

    @staticmethod
    def get_column_order(table: str) -> List[str]:
        """Get the column order for inserting into PostgreSQL.

        Args:
            table: PostgreSQL table name ('rawdatacor' or 'elabdatadisp')

        Returns:
            List of column names in insert order

        Raises:
            ValueError: If ``table`` is not a known table name.
        """
        column_orders = {
            "rawdatacor": [
                "id",
                "unit_name",
                "tool_name_id",
                "node_num",
                "event_date",
                "event_time",
                "bat_level",
                "temperature",
                "measurements",
                "created_at",
                "bat_level_module",
                "temperature_module",
                "rssi_module",
            ],
            "elabdatadisp": [
                "id_elab_data",
                "unit_name",
                "tool_name_id",
                "node_num",
                "event_date",
                "event_time",
                "state",
                "calc_err",
                "measurements",
                "created_at",
                "updated_at",
            ],
        }
        if table not in column_orders:
            raise ValueError(f"Unknown table: {table}")
        return column_orders[table]
|
||||
149
src/transformers/schema_transformer.py
Normal file
149
src/transformers/schema_transformer.py
Normal file
@@ -0,0 +1,149 @@
|
||||
"""PostgreSQL schema creation from MySQL structure."""
|
||||
from config import PARTITION_YEARS
|
||||
from src.utils.logger import get_logger
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
def create_rawdatacor_schema(partition_years=None) -> str:
    """Create PostgreSQL schema for RAWDATACOR table.

    Args:
        partition_years: Optional iterable of years to create partitions for.
            Defaults to the configured PARTITION_YEARS.

    Returns:
        SQL script to create the partitioned table, one partition per year,
        and the supporting indexes.
    """
    if partition_years is None:
        partition_years = PARTITION_YEARS

    # Partition directly on event_date rather than EXTRACT(YEAR FROM event_date):
    # PostgreSQL rejects a PRIMARY KEY on a partitioned table whose partition
    # key contains an expression (unique constraints must be built from plain
    # partition-key columns), so an expression-based key would make this DDL
    # fail at CREATE TABLE time.
    sql = """
-- Create RAWDATACOR table with partitioning
CREATE TABLE IF NOT EXISTS rawdatacor (
    id BIGSERIAL NOT NULL,
    unit_name VARCHAR(32),
    tool_name_id VARCHAR(32) NOT NULL,
    node_num INTEGER NOT NULL,
    event_date DATE NOT NULL,
    event_time TIME NOT NULL,
    bat_level NUMERIC(4,2) NOT NULL,
    temperature NUMERIC(5,2) NOT NULL,
    measurements JSONB,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    bat_level_module NUMERIC(4,2),
    temperature_module NUMERIC(5,2),
    rssi_module INTEGER,
    PRIMARY KEY (id, event_date)
) PARTITION BY RANGE (event_date);

-- Create partitions for each year
"""
    # Range upper bounds are exclusive, so [year-01-01, next_year-01-01)
    # covers exactly one calendar year.
    for year in partition_years:
        next_year = year + 1
        sql += f"""
CREATE TABLE IF NOT EXISTS rawdatacor_{year}
    PARTITION OF rawdatacor
    FOR VALUES FROM ('{year}-01-01') TO ('{next_year}-01-01');
"""

    # Composite lookup indexes plus a GIN index for JSONB containment queries.
    sql += """
-- Create indexes
CREATE INDEX IF NOT EXISTS idx_unit_tool_node_datetime_raw
    ON rawdatacor(unit_name, tool_name_id, node_num, event_date, event_time);

CREATE INDEX IF NOT EXISTS idx_unit_tool_raw
    ON rawdatacor(unit_name, tool_name_id);

CREATE INDEX IF NOT EXISTS idx_measurements_gin_raw
    ON rawdatacor USING GIN (measurements);

CREATE INDEX IF NOT EXISTS idx_event_date_raw
    ON rawdatacor(event_date);
"""

    return sql
|
||||
|
||||
|
||||
def create_elabdatadisp_schema(partition_years=None) -> str:
    """Create PostgreSQL schema for ELABDATADISP table.

    Args:
        partition_years: Optional iterable of years to create partitions for.
            Defaults to the configured PARTITION_YEARS.

    Returns:
        SQL script to create the partitioned table, one partition per year,
        and the supporting indexes.
    """
    if partition_years is None:
        partition_years = PARTITION_YEARS

    # Partition directly on event_date rather than EXTRACT(YEAR FROM event_date):
    # PostgreSQL rejects a PRIMARY KEY on a partitioned table whose partition
    # key contains an expression (unique constraints must be built from plain
    # partition-key columns), so an expression-based key would make this DDL
    # fail at CREATE TABLE time.
    sql = """
-- Create ELABDATADISP table with partitioning
CREATE TABLE IF NOT EXISTS elabdatadisp (
    id_elab_data BIGSERIAL NOT NULL,
    unit_name VARCHAR(32),
    tool_name_id VARCHAR(32) NOT NULL,
    node_num INTEGER NOT NULL,
    event_date DATE NOT NULL,
    event_time TIME NOT NULL,
    state VARCHAR(32),
    calc_err INTEGER DEFAULT 0,
    measurements JSONB,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (id_elab_data, event_date)
) PARTITION BY RANGE (event_date);

-- Create partitions for each year
"""
    # Range upper bounds are exclusive, so [year-01-01, next_year-01-01)
    # covers exactly one calendar year.
    for year in partition_years:
        next_year = year + 1
        sql += f"""
CREATE TABLE IF NOT EXISTS elabdatadisp_{year}
    PARTITION OF elabdatadisp
    FOR VALUES FROM ('{year}-01-01') TO ('{next_year}-01-01');
"""

    # Composite lookup indexes plus a GIN index for JSONB containment queries.
    sql += """
-- Create indexes
CREATE INDEX IF NOT EXISTS idx_unit_tool_node_datetime_elab
    ON elabdatadisp(unit_name, tool_name_id, node_num, event_date, event_time);

CREATE INDEX IF NOT EXISTS idx_unit_tool_elab
    ON elabdatadisp(unit_name, tool_name_id);

CREATE INDEX IF NOT EXISTS idx_measurements_gin_elab
    ON elabdatadisp USING GIN (measurements);

CREATE INDEX IF NOT EXISTS idx_event_date_elab
    ON elabdatadisp(event_date);
"""

    return sql
|
||||
|
||||
|
||||
def create_migration_state_table() -> str:
    """Create table to track migration state.

    One row per migrated table records progress (last id/timestamp), row
    counts, and lifecycle timestamps; status starts as 'pending'.

    Returns:
        SQL to create migration_state table
    """
    return """
-- Create table to track migration state
CREATE TABLE IF NOT EXISTS migration_state (
    table_name VARCHAR(255) PRIMARY KEY,
    last_migrated_timestamp TIMESTAMP,
    last_migrated_id BIGINT,
    migration_started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    migration_completed_at TIMESTAMP,
    total_rows_migrated BIGINT DEFAULT 0,
    status VARCHAR(32) DEFAULT 'pending'
);
"""
|
||||
|
||||
|
||||
def get_full_schema_script() -> str:
    """Get complete schema creation script for PostgreSQL.

    Concatenates the RAWDATACOR, ELABDATADISP, and migration_state scripts,
    separated by blank lines.

    Returns:
        Full SQL script to create all tables and indexes
    """
    sections = [
        create_rawdatacor_schema(),
        create_elabdatadisp_schema(),
        create_migration_state_table(),
    ]
    return "\n\n".join(sections)
|
||||
Reference in New Issue
Block a user