Files
mysql2postgres/config.py
alex 6306006f82 Fix: Update RAWDATACOR partition end_year and mapping logic
- Fix get_partitions_from_year(): RAWDATACOR end_year now 2030 (was 2024)
  - RAWDATACOR has partitions d12-d17 for years 2025-2030, not just part0-part10
- Update year_to_partition_name() for RAWDATACOR: handle both part and d suffix
  - Years 2014-2024: use "part" suffix with formula (year - 2014)
  - Years 2025-2030: use "d" suffix with formula (year - 2013) for d12-d17
- Clamp year to range [2014, 2030] for RAWDATACOR
- Update docstring examples to reflect new mapping behavior
- Now correctly generates partitions like: part8, part9, part10, d12, d13, ..., d17

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2026-01-11 15:55:41 +01:00

305 lines
9.6 KiB
Python

"""Configuration management using Pydantic settings."""
from pydantic_settings import BaseSettings
from pydantic import ConfigDict
from typing import Optional
import os
class MySQLConfig(BaseSettings):
    """MySQL source database configuration.

    Values come from MYSQL_-prefixed environment variables or a local
    .env file (MYSQL_HOST, MYSQL_PORT, MYSQL_USER, ...).
    """
    # NOTE(review): pydantic-settings v2 recommends SettingsConfigDict for
    # settings models; ConfigDict works at runtime here — confirm intent.
    model_config = ConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        env_prefix="MYSQL_",
        case_sensitive=False,
        extra="ignore",
    )

    host: str      # MySQL server hostname or IP
    port: int      # MySQL server port
    user: str      # connection user
    password: str  # connection password
    database: str  # source schema name
class PostgreSQLConfig(BaseSettings):
    """PostgreSQL target database configuration.

    Values come from POSTGRES_-prefixed environment variables or a local
    .env file (POSTGRES_HOST, POSTGRES_PORT, POSTGRES_USER, ...).
    """
    model_config = ConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        env_prefix="POSTGRES_",
        case_sensitive=False,
        extra="ignore",
    )

    host: str      # PostgreSQL server hostname or IP
    port: int      # PostgreSQL server port
    user: str      # connection user
    password: str  # connection password
    database: str  # target database name
class MigrationSettings(BaseSettings):
    """Migration tuning knobs (no env prefix; read directly from env/.env)."""
    model_config = ConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    batch_size: int = 10000                  # rows handled per batch
    consolidation_group_limit: int = 10000   # cap per consolidation group — confirm against migration code
    log_level: str = "INFO"                  # standard logging level name
    dry_run: bool = False                    # presumably skips target writes when True — confirm
    progress_log_interval: int = 50000       # presumably rows between progress log lines — confirm
class BenchmarkSettings(BaseSettings):
    """Benchmark settings (BENCHMARK_-prefixed environment variables)."""
    model_config = ConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        env_prefix="BENCHMARK_",
        case_sensitive=False,
        extra="ignore",
    )

    output_dir: str = "benchmark_results"  # where benchmark artifacts are written
    iterations: int = 5                    # number of benchmark repetitions
class Settings(BaseSettings):
    """Aggregate of every application settings group."""
    model_config = ConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
        extra="ignore",
    )

    mysql: MySQLConfig          # source database connection
    postgres: PostgreSQLConfig  # target database connection
    migration: MigrationSettings
    benchmark: BenchmarkSettings

    @classmethod
    def from_env(cls):
        """Construct Settings, letting each sub-config load itself from the
        environment / .env file."""
        return cls(
            mysql=MySQLConfig(),
            postgres=PostgreSQLConfig(),
            migration=MigrationSettings(),
            benchmark=BenchmarkSettings(),
        )
# Module-level cache: Settings is loaded at most once per process.
_settings: Optional[Settings] = None


def get_settings() -> Settings:
    """Return the cached Settings, loading from .env/environment on first use."""
    global _settings
    if _settings is not None:
        return _settings
    _settings = Settings.from_env()
    return _settings
# Schema transformation definitions
# RAWDATACOR carries 16 value slots indexed by hex digit (Val0..ValF), each
# paired with a unit-of-measure column (Val0_unitmisure..ValF_unitmisure).
RAWDATACOR_COLUMNS = {
    "val_columns": [f"Val{digit}" for digit in "0123456789ABCDEF"],
    "unit_columns": [f"Val{digit}_unitmisure" for digit in "0123456789ABCDEF"],
}
# ELABDATADISP measurement columns, grouped by category.
ELABDATADISP_MEASUREMENT_FIELDS = {
    "shifts": [
        "XShift", "YShift", "ZShift",
        "HShift", "HShiftDir", "HShift_local",
    ],
    "coordinates": ["X", "Y", "Z", "Xstar", "Zstar"],
    "kinematics": [
        "speed", "speed_local",
        "acceleration", "acceleration_local",
    ],
    "sensors": ["T_node", "load_value", "water_level", "pressure"],
    "calculated": ["AlfaX", "AlfaY", "Area"],
}
# Field mapping: MySQL source column -> (category, key).
# Built from per-category (source, key) pair lists to keep each group together.
ELABDATADISP_FIELD_MAPPING = {
    source: (category, key)
    for category, pairs in (
        ("shifts", (
            ("XShift", "x"), ("YShift", "y"), ("ZShift", "z"),
            ("HShift", "h"), ("HShiftDir", "h_dir"), ("HShift_local", "h_local"),
        )),
        ("coordinates", (
            ("X", "x"), ("Y", "y"), ("Z", "z"),
            ("Xstar", "x_star"), ("Zstar", "z_star"),
        )),
        ("kinematics", (
            ("speed", "speed"), ("speed_local", "speed_local"),
            ("acceleration", "acceleration"),
            ("acceleration_local", "acceleration_local"),
        )),
        ("sensors", (
            ("T_node", "t_node"), ("load_value", "load_value"),
            ("water_level", "water_level"), ("pressure", "pressure"),
        )),
        ("calculated", (
            ("AlfaX", "alfa_x"), ("AlfaY", "alfa_y"), ("Area", "area"),
        )),
    )
    for source, key in pairs
}
# PostgreSQL partition years across both tables: 2014 through 2031 inclusive.
PARTITION_YEARS = list(range(2014, 2032))

# Consolidation key (same for both tables): multiple MySQL rows sharing these
# values but differing in NodeNum collapse into a single PostgreSQL row.
# These are the MySQL source field names.
CONSOLIDATION_KEY_FIELDS = ["UnitName", "ToolNameID", "EventDate", "EventTime"]

# Snake-case names used only for migration_state.last_key tracking — NOT actual
# PostgreSQL target columns (EventDate + EventTime become event_timestamp there).
CONSOLIDATION_KEY_PG_FIELDS = ["unit_name", "tool_name_id", "event_date", "event_time"]
# Table configurations — addressable by both uppercase and lowercase names.
_rawdatacor_config = {
    "mysql_table": "RAWDATACOR",
    "postgres_table": "rawdatacor",
    "mysql_pk": "id",                 # MySQL primary key
    "postgres_pk": "id",              # PostgreSQL auto-increment primary key
    "mysql_max_id_field": "id",       # field tracked for max ID on the MySQL side
    "consolidation_key": CONSOLIDATION_KEY_FIELDS,
    "consolidation_key_pg": CONSOLIDATION_KEY_PG_FIELDS,
}

_elabdatadisp_config = {
    "mysql_table": "ELABDATADISP",
    "postgres_table": "elabdatadisp",
    "mysql_pk": "idElabData",             # MySQL primary key
    "postgres_pk": "id",                  # PostgreSQL auto-increment primary key
    "mysql_max_id_field": "idElabData",   # field tracked for max ID on the MySQL side
    "consolidation_key": CONSOLIDATION_KEY_FIELDS,
    "consolidation_key_pg": CONSOLIDATION_KEY_PG_FIELDS,
}

# Both case variants of each table name point at the same shared config dict.
TABLE_CONFIGS = {
    alias: config
    for config in (_rawdatacor_config, _elabdatadisp_config)
    for alias in (config["postgres_table"], config["mysql_table"])
}
# Partition mapping utilities

def year_to_partition_name(year: int, table: str) -> str:
    """Return the name of the partition that holds *year*'s data in *table*.

    Naming schemes:
      - RAWDATACOR: 2014-2024 map to "part{year-2014}" (part0..part10);
        2025-2030 continue as "d{year-2013}" (d12..d17). Years outside
        [2014, 2030] are clamped to the nearest bound.
      - ELABDATADISP: 2013-2031 map to "d{year-2013}" (d0..d18), with
        years clamped to [2013, 2031].

    Args:
        year: Calendar year to map.
        table: Table name, case-insensitive (RAWDATACOR or ELABDATADISP).

    Returns:
        Partition name, e.g. "part8" for RAWDATACOR/2022 or "d14" for
        ELABDATADISP/2026.

    Raises:
        ValueError: If the table name is not recognized.
    """
    normalized = table.upper()
    if normalized == "RAWDATACOR":
        clamped = min(max(year, 2014), 2030)
        # part0..part10 cover 2014-2024; 2025 onward uses the d12..d17 series.
        if clamped <= 2024:
            return f"part{clamped - 2014}"
        return f"d{clamped - 2013}"
    if normalized == "ELABDATADISP":
        clamped = min(max(year, 2013), 2031)
        return f"d{clamped - 2013}"
    raise ValueError(f"Unknown table: {table}")
def get_partitions_from_year(year: int, table: str) -> list[str]:
    """Return partition names for *table* from *year* through 2030.

    Args:
        year: Starting year (inclusive).
        table: Table name, case-insensitive (RAWDATACOR or ELABDATADISP).

    Returns:
        Ordered partition names covering [year, 2030]. Because out-of-range
        years clamp to the same edge partition, consecutive duplicates are
        collapsed.

    Raises:
        ValueError: If the table name is not recognized.

    Example:
        get_partitions_from_year(2022, "RAWDATACOR")
        -> ["part8", "part9", "part10", "d12", "d13", "d14", "d15", "d16", "d17"]
        get_partitions_from_year(2025, "ELABDATADISP")
        -> ["d12", "d13", "d14", "d15", "d16", "d17"]
    """
    if table.upper() not in ("RAWDATACOR", "ELABDATADISP"):
        raise ValueError(f"Unknown table: {table}")
    end_year = 2030  # latest partitioned year handled here for both tables
    names: list[str] = []
    for candidate in range(year, end_year + 1):
        partition = year_to_partition_name(candidate, table)
        # Clamping can map several years onto one edge partition — skip repeats.
        if not names or names[-1] != partition:
            names.append(partition)
    return names
def date_string_to_partition_name(date_str: str, table: str) -> str:
    """Extract the year from a 'YYYY-MM-DD' date string and map it to a
    partition name.

    Falls back to the 2014 partition when the date is missing, too short,
    or its year portion cannot be parsed.

    Args:
        date_str: Date string whose first four characters are the year
            (e.g. '2022-05-15').
        table: Table name (RAWDATACOR or ELABDATADISP).

    Returns:
        Partition name, e.g. "part8" for "2022-05-15" on RAWDATACOR.

    Raises:
        ValueError: If the table name is not recognized.

    Example:
        date_string_to_partition_name("2022-05-15", "RAWDATACOR") -> "part8"
    """
    default_year = 2014
    if not date_str or len(date_str) < 4:
        return year_to_partition_name(default_year, table)
    try:
        # Keep the try narrow: only the int() conversion may legitimately
        # fail here. The previous version also wrapped the partition lookup,
        # which swallowed "Unknown table" ValueErrors only to re-raise them
        # from the fallback call.
        year = int(date_str[:4])
    except (ValueError, TypeError):
        return year_to_partition_name(default_year, table)
    return year_to_partition_name(year, table)