"""Configuration management using Pydantic settings.""" from pydantic_settings import BaseSettings from pydantic import ConfigDict from typing import Optional import os class MySQLConfig(BaseSettings): """MySQL source database configuration.""" model_config = ConfigDict( env_prefix="MYSQL_", case_sensitive=False, extra="ignore", env_file=".env", env_file_encoding="utf-8" ) host: str port: int user: str password: str database: str class PostgreSQLConfig(BaseSettings): """PostgreSQL target database configuration.""" model_config = ConfigDict( env_prefix="POSTGRES_", case_sensitive=False, extra="ignore", env_file=".env", env_file_encoding="utf-8" ) host: str port: int user: str password: str database: str class MigrationSettings(BaseSettings): """Migration settings.""" model_config = ConfigDict( case_sensitive=False, extra="ignore", env_file=".env", env_file_encoding="utf-8" ) batch_size: int = 10000 consolidation_group_limit: int = 10000 log_level: str = "INFO" dry_run: bool = False progress_log_interval: int = 50000 class BenchmarkSettings(BaseSettings): """Benchmark settings.""" model_config = ConfigDict( env_prefix="BENCHMARK_", case_sensitive=False, extra="ignore", env_file=".env", env_file_encoding="utf-8" ) output_dir: str = "benchmark_results" iterations: int = 5 class Settings(BaseSettings): """All application settings.""" model_config = ConfigDict( env_file=".env", env_file_encoding="utf-8", case_sensitive=False, extra="ignore" ) mysql: MySQLConfig postgres: PostgreSQLConfig migration: MigrationSettings benchmark: BenchmarkSettings @classmethod def from_env(cls): """Load settings from environment variables.""" return cls( mysql=MySQLConfig(), postgres=PostgreSQLConfig(), migration=MigrationSettings(), benchmark=BenchmarkSettings(), ) # Lazy load settings _settings: Optional[Settings] = None def get_settings() -> Settings: """Get application settings, loading from .env if necessary.""" global _settings if _settings is None: _settings = Settings.from_env() 
return _settings # Schema transformation definitions RAWDATACOR_COLUMNS = { "val_columns": ["Val0", "Val1", "Val2", "Val3", "Val4", "Val5", "Val6", "Val7", "Val8", "Val9", "ValA", "ValB", "ValC", "ValD", "ValE", "ValF"], "unit_columns": ["Val0_unitmisure", "Val1_unitmisure", "Val2_unitmisure", "Val3_unitmisure", "Val4_unitmisure", "Val5_unitmisure", "Val6_unitmisure", "Val7_unitmisure", "Val8_unitmisure", "Val9_unitmisure", "ValA_unitmisure", "ValB_unitmisure", "ValC_unitmisure", "ValD_unitmisure", "ValE_unitmisure", "ValF_unitmisure"], } ELABDATADISP_MEASUREMENT_FIELDS = { "shifts": ["XShift", "YShift", "ZShift", "HShift", "HShiftDir", "HShift_local"], "coordinates": ["X", "Y", "Z", "Xstar", "Zstar"], "kinematics": ["speed", "speed_local", "acceleration", "acceleration_local"], "sensors": ["T_node", "load_value", "water_level", "pressure"], "calculated": ["AlfaX", "AlfaY", "Area"], } ELABDATADISP_FIELD_MAPPING = { # shifts mapping (source -> (category, key)) "XShift": ("shifts", "x"), "YShift": ("shifts", "y"), "ZShift": ("shifts", "z"), "HShift": ("shifts", "h"), "HShiftDir": ("shifts", "h_dir"), "HShift_local": ("shifts", "h_local"), # coordinates mapping "X": ("coordinates", "x"), "Y": ("coordinates", "y"), "Z": ("coordinates", "z"), "Xstar": ("coordinates", "x_star"), "Zstar": ("coordinates", "z_star"), # kinematics mapping "speed": ("kinematics", "speed"), "speed_local": ("kinematics", "speed_local"), "acceleration": ("kinematics", "acceleration"), "acceleration_local": ("kinematics", "acceleration_local"), # sensors mapping "T_node": ("sensors", "t_node"), "load_value": ("sensors", "load_value"), "water_level": ("sensors", "water_level"), "pressure": ("sensors", "pressure"), # calculated mapping "AlfaX": ("calculated", "alfa_x"), "AlfaY": ("calculated", "alfa_y"), "Area": ("calculated", "area"), } # PostgreSQL Partition years (from both tables) PARTITION_YEARS = list(range(2014, 2032)) # 2014-2031 # Consolidation key definition (same for both tables) # 
# Multiple MySQL rows with same key but different NodeNum → 1 PostgreSQL row.

# MySQL source fields forming the consolidation key.
CONSOLIDATION_KEY_FIELDS = ["UnitName", "ToolNameID", "EventDate", "EventTime"]

# Keys for tracking in migration_state.last_key (NOT actual PostgreSQL target
# columns). Note: in the PostgreSQL target, EventDate+EventTime become
# event_timestamp.
CONSOLIDATION_KEY_PG_FIELDS = ["unit_name", "tool_name_id", "event_date", "event_time"]

# Table configurations - support both uppercase and lowercase keys.
_rawdatacor_config = {
    "mysql_table": "RAWDATACOR",
    "postgres_table": "rawdatacor",
    "mysql_pk": "id",  # MySQL primary key
    "postgres_pk": "id",  # PostgreSQL auto-increment primary key
    "mysql_max_id_field": "id",  # Field to track max ID from MySQL
    "consolidation_key": CONSOLIDATION_KEY_FIELDS,
    "consolidation_key_pg": CONSOLIDATION_KEY_PG_FIELDS,
}

_elabdatadisp_config = {
    "mysql_table": "ELABDATADISP",
    "postgres_table": "elabdatadisp",
    "mysql_pk": "idElabData",  # MySQL primary key
    "postgres_pk": "id",  # PostgreSQL auto-increment primary key
    "mysql_max_id_field": "idElabData",  # Field to track max ID from MySQL
    "consolidation_key": CONSOLIDATION_KEY_FIELDS,
    "consolidation_key_pg": CONSOLIDATION_KEY_PG_FIELDS,
}

TABLE_CONFIGS = {
    "rawdatacor": _rawdatacor_config,
    "RAWDATACOR": _rawdatacor_config,
    "elabdatadisp": _elabdatadisp_config,
    "ELABDATADISP": _elabdatadisp_config,
}


# Partition mapping utilities

def year_to_partition_name(year: int, table: str) -> str:
    """Map a year to its PostgreSQL partition name.

    Partition naming scheme (different for each table):
    - RAWDATACOR: part0=2014, part1=2015, ..., part10=2024 (part{year-2014});
      from 2025 onwards the naming switches to the d-scheme:
      d12=2025, ..., d17=2030 (d{year-2013}).
    - ELABDATADISP: d0=2013, d1=2014, ..., d18=2031 (d{year-2013}).

    Out-of-range years are clamped to the nearest supported year
    (2014-2030 for RAWDATACOR, 2013-2031 for ELABDATADISP); no
    exception is raised for them.

    Args:
        year: Year to map (clamped as described above).
        table: Table name (RAWDATACOR or ELABDATADISP), case-insensitive.

    Returns:
        Partition name (e.g. "part8" for RAWDATACOR/2022,
        "d13" for ELABDATADISP/2026).

    Raises:
        ValueError: If the table is unknown.
    """
    table_upper = table.upper()
    if table_upper == "RAWDATACOR":
        # Clamp to the supported range 2014-2030.
        year = min(max(year, 2014), 2030)
        if year < 2025:
            # Historic partitions: part0 (2014) ... part10 (2024).
            return f"part{year - 2014}"
        # Newer partitions continue with the d-naming used by
        # ELABDATADISP: d12 (2025) ... d17 (2030).
        return f"d{year - 2013}"
    if table_upper == "ELABDATADISP":
        # Clamp to the supported range 2013-2031: d0 ... d18.
        year = min(max(year, 2013), 2031)
        return f"d{year - 2013}"
    raise ValueError(f"Unknown table: {table}")


def get_partitions_from_year(year: int, table: str) -> list[str]:
    """Get the list of partition names from a specific year onwards.

    Args:
        year: Starting year. Years before the table's first partition all
            clamp to the first partition, which is emitted only once.
        table: Table name (RAWDATACOR or ELABDATADISP), case-insensitive.

    Returns:
        Partition names from that year through 2030 (the latest year with
        a partition in both tables). Empty when year > 2030.

    Raises:
        ValueError: If the table is unknown.

    Example:
        get_partitions_from_year(2022, "RAWDATACOR")
        → ["part8", "part9", "part10", "d12", "d13", "d14", "d15", "d16", "d17"]
        get_partitions_from_year(2025, "ELABDATADISP")
        → ["d12", "d13", "d14", "d15", "d16", "d17"]
    """
    table_upper = table.upper()
    if table_upper == "RAWDATACOR":
        end_year = 2030  # part0-part10 (2014-2024) + d12-d17 (2025-2030)
    elif table_upper == "ELABDATADISP":
        end_year = 2030  # d0-d17 (2013-2030)
    else:
        raise ValueError(f"Unknown table: {table}")

    partitions: list[str] = []
    for y in range(year, end_year + 1):
        name = year_to_partition_name(y, table)
        # Skip duplicates produced when clamping maps several years to
        # the same partition.
        if not partitions or partitions[-1] != name:
            partitions.append(name)
    return partitions


def date_string_to_partition_name(date_str: str, table: str) -> str:
    """Extract the year from a date string and map it to a partition name.

    Args:
        date_str: Date string in 'YYYY-MM-DD' format (e.g. '2022-05-15').
        table: Table name (RAWDATACOR or ELABDATADISP), case-insensitive.

    Returns:
        Partition name (e.g. "part8"); falls back to the partition for
        2014 when the date is missing or unparseable.

    Example:
        date_string_to_partition_name("2022-05-15", "RAWDATACOR") → "part8"
    """
    fallback_year = 2014
    if not date_str or len(date_str) < 4:
        # Missing/too-short date: default to the earliest common year.
        return year_to_partition_name(fallback_year, table)
    try:
        year = int(date_str[:4])
    except (ValueError, TypeError):
        # Non-numeric year prefix: default to the earliest common year.
        return year_to_partition_name(fallback_year, table)
    return year_to_partition_name(year, table)