Files
mysql2postgres/config.py
alex 1430ef206f fix: Ensure complete node consolidation by ordering MySQL query by consolidation key
Root cause: Nodes 1-11 had IDs in 132M+ range while nodes 12-22 had IDs in 298-308
range, causing them to be fetched in batches thousands apart using keyset pagination
by ID. This meant they arrived as separate groups and were never unified into a
single consolidated row.

Solution: Order MySQL query by (UnitName, ToolNameID, EventDate, EventTime) instead
of by ID. This guarantees all rows for the same consolidation key arrive together,
ensuring they are grouped and consolidated into a single row with JSONB measurements
keyed by node number.

Changes:
- fetch_consolidation_groups_from_partition(): Changed from keyset pagination by ID
  to ORDER BY consolidation key. Simplify grouping logic since ORDER BY already ensures
  consecutive rows have same key.
- full_migration.py: Add cleanup of partial partitions on resume. When resuming and a
  partition was started but not completed, delete its incomplete data before
  re-processing to avoid duplicates. Also recalculate total_rows_migrated from actual
  database count.
- config.py: Add postgres_pk field to TABLE_CONFIGS to specify correct primary key
  column names in PostgreSQL (id vs id_elab_data).
- Cleanup: Remove temporary test scripts used during debugging

Performance note: ORDER BY consolidation key requires index for speed. Index
(UnitName, ToolNameID, EventDate, EventTime) created with ALGORITHM=INPLACE
LOCK=NONE to avoid blocking reads.

🤖 Generated with Claude Code

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-26 18:22:23 +01:00

177 lines
5.1 KiB
Python

"""Configuration management using Pydantic settings."""
from pydantic_settings import BaseSettings
from pydantic import ConfigDict
from typing import Optional
import os
class MySQLConfig(BaseSettings):
"""MySQL source database configuration."""
model_config = ConfigDict(
env_prefix="MYSQL_",
case_sensitive=False,
extra="ignore",
env_file=".env",
env_file_encoding="utf-8"
)
host: str
port: int
user: str
password: str
database: str
class PostgreSQLConfig(BaseSettings):
"""PostgreSQL target database configuration."""
model_config = ConfigDict(
env_prefix="POSTGRES_",
case_sensitive=False,
extra="ignore",
env_file=".env",
env_file_encoding="utf-8"
)
host: str
port: int
user: str
password: str
database: str
class MigrationSettings(BaseSettings):
"""Migration settings."""
model_config = ConfigDict(
case_sensitive=False,
extra="ignore",
env_file=".env",
env_file_encoding="utf-8"
)
batch_size: int = 10000
consolidation_group_limit: int = 10000
log_level: str = "INFO"
dry_run: bool = False
class BenchmarkSettings(BaseSettings):
"""Benchmark settings."""
model_config = ConfigDict(
env_prefix="BENCHMARK_",
case_sensitive=False,
extra="ignore",
env_file=".env",
env_file_encoding="utf-8"
)
output_dir: str = "benchmark_results"
iterations: int = 5
class Settings(BaseSettings):
"""All application settings."""
model_config = ConfigDict(
env_file=".env",
env_file_encoding="utf-8",
case_sensitive=False,
extra="ignore"
)
mysql: MySQLConfig
postgres: PostgreSQLConfig
migration: MigrationSettings
benchmark: BenchmarkSettings
@classmethod
def from_env(cls):
"""Load settings from environment variables."""
return cls(
mysql=MySQLConfig(),
postgres=PostgreSQLConfig(),
migration=MigrationSettings(),
benchmark=BenchmarkSettings(),
)
# Lazy load settings
_settings: Optional[Settings] = None
def get_settings() -> Settings:
"""Get application settings, loading from .env if necessary."""
global _settings
if _settings is None:
_settings = Settings.from_env()
return _settings
# Schema transformation definitions
RAWDATACOR_COLUMNS = {
"val_columns": ["Val0", "Val1", "Val2", "Val3", "Val4", "Val5", "Val6", "Val7", "Val8", "Val9", "ValA", "ValB", "ValC", "ValD", "ValE", "ValF"],
"unit_columns": ["Val0_unitmisure", "Val1_unitmisure", "Val2_unitmisure", "Val3_unitmisure", "Val4_unitmisure", "Val5_unitmisure", "Val6_unitmisure", "Val7_unitmisure", "Val8_unitmisure", "Val9_unitmisure", "ValA_unitmisure", "ValB_unitmisure", "ValC_unitmisure", "ValD_unitmisure", "ValE_unitmisure", "ValF_unitmisure"],
}
ELABDATADISP_MEASUREMENT_FIELDS = {
"shifts": ["XShift", "YShift", "ZShift", "HShift", "HShiftDir", "HShift_local"],
"coordinates": ["X", "Y", "Z", "Xstar", "Zstar"],
"kinematics": ["speed", "speed_local", "acceleration", "acceleration_local"],
"sensors": ["T_node", "load_value", "water_level", "pressure"],
"calculated": ["AlfaX", "AlfaY", "Area"],
}
ELABDATADISP_FIELD_MAPPING = {
# shifts mapping (source -> (category, key))
"XShift": ("shifts", "x"),
"YShift": ("shifts", "y"),
"ZShift": ("shifts", "z"),
"HShift": ("shifts", "h"),
"HShiftDir": ("shifts", "h_dir"),
"HShift_local": ("shifts", "h_local"),
# coordinates mapping
"X": ("coordinates", "x"),
"Y": ("coordinates", "y"),
"Z": ("coordinates", "z"),
"Xstar": ("coordinates", "x_star"),
"Zstar": ("coordinates", "z_star"),
# kinematics mapping
"speed": ("kinematics", "speed"),
"speed_local": ("kinematics", "speed_local"),
"acceleration": ("kinematics", "acceleration"),
"acceleration_local": ("kinematics", "acceleration_local"),
# sensors mapping
"T_node": ("sensors", "t_node"),
"load_value": ("sensors", "load_value"),
"water_level": ("sensors", "water_level"),
"pressure": ("sensors", "pressure"),
# calculated mapping
"AlfaX": ("calculated", "alfa_x"),
"AlfaY": ("calculated", "alfa_y"),
"Area": ("calculated", "area"),
}
# PostgreSQL Partition years (from both tables)
PARTITION_YEARS = list(range(2014, 2032)) # 2014-2031
# Table configurations - support both uppercase and lowercase keys
_rawdatacor_config = {
"mysql_table": "RAWDATACOR",
"postgres_table": "rawdatacor",
"primary_key": "id",
"postgres_pk": "id", # Primary key column name in PostgreSQL
"partition_key": "event_timestamp",
}
_elabdatadisp_config = {
"mysql_table": "ELABDATADISP",
"postgres_table": "elabdatadisp",
"primary_key": "idElabData",
"postgres_pk": "id_elab_data", # Primary key column name in PostgreSQL
"partition_key": "event_timestamp",
}
TABLE_CONFIGS = {
"rawdatacor": _rawdatacor_config,
"RAWDATACOR": _rawdatacor_config,
"elabdatadisp": _elabdatadisp_config,
"ELABDATADISP": _elabdatadisp_config,
}