Configuration improvements:
- Set read_timeout=300 (5 minutes) to handle long queries.
- Set write_timeout=300 (5 minutes) for writes.
- Set max_allowed_packet=64MB to handle larger data transfers.

Retry logic:
- Added retry mechanism with a maximum of 3 retries on fetch failure.
- Auto-reconnect on connection loss before each retry.
- Better error messages showing retry attempts.

This fixes the "connection is lost" error that occurs during long-running migrations by:
1. Giving MySQL queries more time to complete.
2. Allowing larger packet sizes for bulk data.
3. Automatically recovering from connection drops.

Fixes: "Connection is lost" error during full migration.
249 lines · 8.6 KiB · Python
"""Data transformation from MySQL to PostgreSQL format."""
|
|
from typing import Dict, Any, List
|
|
from datetime import datetime, time, timedelta
|
|
from config import (
|
|
RAWDATACOR_COLUMNS,
|
|
ELABDATADISP_FIELD_MAPPING,
|
|
TABLE_CONFIGS,
|
|
)
|
|
from src.utils.logger import get_logger
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
|
|
class DataTransformer:
    """Transform MySQL data to PostgreSQL format.

    All methods are static: this class is a stateless namespace for the
    row/batch transformation routines used by the migration pipeline.
    """

    # Fallback timestamp used when a row is missing EventDate and/or
    # EventTime (logged as a warning so bad rows can be tracked down).
    _DEFAULT_TIMESTAMP = datetime(1970, 1, 1, 0, 0, 0)

    @staticmethod
    def _convert_time(event_time: Any) -> time:
        """Convert event_time to a datetime.time object.

        Handles multiple input types:
        - str: parsed from "HH:MM:SS" format
        - timedelta: MySQL drivers return TIME columns as timedelta
        - time: returned as-is

        Args:
            event_time: Time value from MySQL (str, timedelta, or time).

        Returns:
            datetime.time object.

        Raises:
            ValueError: If event_time has an unsupported type, if a string
                does not match "HH:MM:SS", or (propagated from time()) if a
                timedelta is negative or >= 24 hours, which datetime.time
                cannot represent.
        """
        if isinstance(event_time, str):
            return datetime.strptime(event_time, "%H:%M:%S").time()
        if isinstance(event_time, timedelta):
            # MySQL returns TIME as timedelta; decompose the total number
            # of seconds into hour/minute/second components.
            total_seconds = int(event_time.total_seconds())
            hours, remainder = divmod(total_seconds, 3600)
            minutes, seconds = divmod(remainder, 60)
            return time(hour=hours, minute=minutes, second=seconds)
        if isinstance(event_time, time):
            return event_time
        raise ValueError(f"Unsupported event_time type: {type(event_time)}")

    @staticmethod
    def _combine_event_timestamp(
        event_date: Any,
        event_time: Any,
        row_id: Any,
    ) -> datetime:
        """Combine EventDate and EventTime into a single datetime.

        Shared helper for transform_rawdatacor_row and
        transform_elabdatadisp_row, which previously duplicated this logic
        (including an unreachable ``else`` branch, now removed).

        Args:
            event_date: Date value from MySQL (datetime.date) or None.
            event_time: Time value from MySQL (str, timedelta, or time)
                or None.
            row_id: Row identifier, used only in the warning message.

        Returns:
            Combined datetime, or the 1970-01-01 00:00:00 default when
            either component is NULL (a warning is logged in that case).
        """
        if event_date is not None and event_time is not None:
            return datetime.combine(
                event_date, DataTransformer._convert_time(event_time)
            )

        # Log a warning for records with missing date/time components.
        missing = []
        if event_date is None:
            missing.append("EventDate")
        if event_time is None:
            missing.append("EventTime")
        logger.warning(
            f"Row {row_id} has NULL {', '.join(missing)}. "
            f"Using default timestamp: 1970-01-01 00:00:00"
        )
        return DataTransformer._DEFAULT_TIMESTAMP

    @staticmethod
    def transform_rawdatacor_row(mysql_row: Dict[str, Any]) -> Dict[str, Any]:
        """Transform a RAWDATACOR row from MySQL to PostgreSQL format.

        Builds a JSONB-ready ``measurements`` dict from the Val*/Unit*
        column pairs and merges EventDate + EventTime into a single
        ``event_timestamp``.

        Args:
            mysql_row: Row dictionary from MySQL.

        Returns:
            Transformed row dictionary for PostgreSQL.
        """
        # Map each value column with its paired unit column into the
        # measurements JSONB. Keys are the channel index as a string;
        # NULL values are skipped entirely.
        measurements: Dict[str, Any] = {}
        for i, val_col in enumerate(RAWDATACOR_COLUMNS["val_columns"]):
            unit_col = RAWDATACOR_COLUMNS["unit_columns"][i]
            value = mysql_row.get(val_col)
            unit = mysql_row.get(unit_col)
            if value is not None:
                measurements[str(i)] = {
                    "value": str(value),
                    # Empty-string units are normalized to None.
                    "unit": unit if unit else None,
                }

        event_timestamp = DataTransformer._combine_event_timestamp(
            mysql_row.get("EventDate"),
            mysql_row.get("EventTime"),
            mysql_row.get("id"),
        )

        # Assemble the PostgreSQL row. Required MySQL columns use direct
        # indexing (KeyError on absence); optional ones use .get().
        return {
            "id": mysql_row["id"],
            "unit_name": mysql_row.get("UnitName"),
            "tool_name_id": mysql_row["ToolNameID"],
            "node_num": mysql_row["NodeNum"],
            "event_timestamp": event_timestamp,
            "bat_level": mysql_row["BatLevel"],
            "temperature": mysql_row["Temperature"],
            "measurements": measurements,
            "created_at": mysql_row.get("created_at"),
            "bat_level_module": mysql_row.get("BatLevelModule"),
            "temperature_module": mysql_row.get("TemperatureModule"),
            "rssi_module": mysql_row.get("RssiModule"),
        }

    @staticmethod
    def transform_elabdatadisp_row(mysql_row: Dict[str, Any]) -> Dict[str, Any]:
        """Transform an ELABDATADISP row from MySQL to PostgreSQL format.

        Groups measurement fields into structured JSONB categories
        (shifts, coordinates, kinematics, sensors, calculated) using the
        ELABDATADISP_FIELD_MAPPING configuration.

        Args:
            mysql_row: Row dictionary from MySQL.

        Returns:
            Transformed row dictionary for PostgreSQL.
        """
        # Structured measurement categories for the JSONB column.
        measurements: Dict[str, Dict[str, Any]] = {
            "shifts": {},
            "coordinates": {},
            "kinematics": {},
            "sensors": {},
            "calculated": {},
        }

        # Map all measurement fields using the configured
        # mysql_col -> (category, pg_key) mapping. Numeric strings are
        # coerced to float; NULLs are skipped.
        for mysql_col, (category, pg_key) in ELABDATADISP_FIELD_MAPPING.items():
            value = mysql_row.get(mysql_col)
            if value is not None:
                measurements[category][pg_key] = (
                    float(value) if isinstance(value, str) else value
                )

        # Drop categories that received no values so the stored JSONB
        # stays compact.
        measurements = {k: v for k, v in measurements.items() if v}

        event_timestamp = DataTransformer._combine_event_timestamp(
            mysql_row.get("EventDate"),
            mysql_row.get("EventTime"),
            mysql_row.get("idElabData"),
        )

        # Assemble the PostgreSQL row. Required MySQL columns use direct
        # indexing; optional ones use .get() (calcerr defaults to 0).
        return {
            "id_elab_data": mysql_row["idElabData"],
            "unit_name": mysql_row.get("UnitName"),
            "tool_name_id": mysql_row["ToolNameID"],
            "node_num": mysql_row["NodeNum"],
            "event_timestamp": event_timestamp,
            "state": mysql_row.get("State"),
            "calc_err": mysql_row.get("calcerr", 0),
            "measurements": measurements,
            "created_at": mysql_row.get("created_at"),
            "updated_at": mysql_row.get("updated_at"),
        }

    @staticmethod
    def transform_batch(
        table: str,
        rows: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Transform a batch of rows from MySQL to PostgreSQL format.

        Args:
            table: Table name ('RAWDATACOR' or 'ELABDATADISP').
            rows: List of row dictionaries from MySQL.

        Returns:
            List of transformed row dictionaries for PostgreSQL.

        Raises:
            ValueError: If the table name is not recognized.
        """
        if table == "RAWDATACOR":
            transform = DataTransformer.transform_rawdatacor_row
        elif table == "ELABDATADISP":
            transform = DataTransformer.transform_elabdatadisp_row
        else:
            raise ValueError(f"Unknown table: {table}")
        return [transform(row) for row in rows]

    @staticmethod
    def get_column_order(table: str) -> List[str]:
        """Get the column order for inserting into PostgreSQL.

        Args:
            table: PostgreSQL table name ('rawdatacor' or 'elabdatadisp').

        Returns:
            List of column names in insert order, matching the dict keys
            produced by the corresponding transform_*_row method.

        Raises:
            ValueError: If the table name is not recognized.
        """
        if table == "rawdatacor":
            return [
                "id",
                "unit_name",
                "tool_name_id",
                "node_num",
                "event_timestamp",
                "bat_level",
                "temperature",
                "measurements",
                "created_at",
                "bat_level_module",
                "temperature_module",
                "rssi_module",
            ]
        elif table == "elabdatadisp":
            return [
                "id_elab_data",
                "unit_name",
                "tool_name_id",
                "node_num",
                "event_timestamp",
                "state",
                "calc_err",
                "measurements",
                "created_at",
                "updated_at",
            ]
        else:
            raise ValueError(f"Unknown table: {table}")
|