Fix get_partitions_from_year(): RAWDATACOR end_year now 2030 (was 2024)

- RAWDATACOR has partitions d12-d17 for years 2025-2030, not just part0-part10
- Update year_to_partition_name() for RAWDATACOR: handle both "part" and "d" suffixes
  - Years 2014-2024: use "part" suffix with formula (year - 2014)
  - Years 2025-2030: use "d" suffix with formula (year - 2013) for d12-d17
- Clamp year to range [2014, 2030] for RAWDATACOR
- Update docstring examples to reflect new mapping behavior
- Now correctly generates partitions like: part8, part9, part10, d12, d13, ..., d17

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
"""Configuration management using Pydantic settings."""
|
|
from pydantic_settings import BaseSettings
|
|
from pydantic import ConfigDict
|
|
from typing import Optional
|
|
import os
|
|
|
|
|
|
class MySQLConfig(BaseSettings):
|
|
"""MySQL source database configuration."""
|
|
model_config = ConfigDict(
|
|
env_prefix="MYSQL_",
|
|
case_sensitive=False,
|
|
extra="ignore",
|
|
env_file=".env",
|
|
env_file_encoding="utf-8"
|
|
)
|
|
|
|
host: str
|
|
port: int
|
|
user: str
|
|
password: str
|
|
database: str
|
|
|
|
|
|
class PostgreSQLConfig(BaseSettings):
|
|
"""PostgreSQL target database configuration."""
|
|
model_config = ConfigDict(
|
|
env_prefix="POSTGRES_",
|
|
case_sensitive=False,
|
|
extra="ignore",
|
|
env_file=".env",
|
|
env_file_encoding="utf-8"
|
|
)
|
|
|
|
host: str
|
|
port: int
|
|
user: str
|
|
password: str
|
|
database: str
|
|
|
|
|
|
class MigrationSettings(BaseSettings):
|
|
"""Migration settings."""
|
|
model_config = ConfigDict(
|
|
case_sensitive=False,
|
|
extra="ignore",
|
|
env_file=".env",
|
|
env_file_encoding="utf-8"
|
|
)
|
|
|
|
batch_size: int = 10000
|
|
consolidation_group_limit: int = 10000
|
|
log_level: str = "INFO"
|
|
dry_run: bool = False
|
|
progress_log_interval: int = 50000
|
|
|
|
|
|
class BenchmarkSettings(BaseSettings):
|
|
"""Benchmark settings."""
|
|
model_config = ConfigDict(
|
|
env_prefix="BENCHMARK_",
|
|
case_sensitive=False,
|
|
extra="ignore",
|
|
env_file=".env",
|
|
env_file_encoding="utf-8"
|
|
)
|
|
|
|
output_dir: str = "benchmark_results"
|
|
iterations: int = 5
|
|
|
|
|
|
class Settings(BaseSettings):
|
|
"""All application settings."""
|
|
model_config = ConfigDict(
|
|
env_file=".env",
|
|
env_file_encoding="utf-8",
|
|
case_sensitive=False,
|
|
extra="ignore"
|
|
)
|
|
|
|
mysql: MySQLConfig
|
|
postgres: PostgreSQLConfig
|
|
migration: MigrationSettings
|
|
benchmark: BenchmarkSettings
|
|
|
|
@classmethod
|
|
def from_env(cls):
|
|
"""Load settings from environment variables."""
|
|
return cls(
|
|
mysql=MySQLConfig(),
|
|
postgres=PostgreSQLConfig(),
|
|
migration=MigrationSettings(),
|
|
benchmark=BenchmarkSettings(),
|
|
)
|
|
|
|
|
|
# Lazy load settings
_settings: Optional[Settings] = None


def get_settings() -> Settings:
    """Get application settings, loading from .env if necessary."""
    global _settings
    if _settings is None:
        _settings = Settings.from_env()
    return _settings

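# Usage sketch (illustrative only, kept as comments so nothing runs at import time):
# how callers might read the lazily-loaded settings, assuming a populated .env.
# The "dsn" variable is just an example name, not something this module defines.
#
#     settings = get_settings()
#     dsn = (
#         f"mysql://{settings.mysql.user}:{settings.mysql.password}"
#         f"@{settings.mysql.host}:{settings.mysql.port}/{settings.mysql.database}"
#     )
#     if settings.migration.dry_run:
#         print(f"Dry run, batch size {settings.migration.batch_size}")
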
# Schema transformation definitions
RAWDATACOR_COLUMNS = {
    "val_columns": [
        "Val0", "Val1", "Val2", "Val3", "Val4", "Val5", "Val6", "Val7",
        "Val8", "Val9", "ValA", "ValB", "ValC", "ValD", "ValE", "ValF",
    ],
    "unit_columns": [
        "Val0_unitmisure", "Val1_unitmisure", "Val2_unitmisure", "Val3_unitmisure",
        "Val4_unitmisure", "Val5_unitmisure", "Val6_unitmisure", "Val7_unitmisure",
        "Val8_unitmisure", "Val9_unitmisure", "ValA_unitmisure", "ValB_unitmisure",
        "ValC_unitmisure", "ValD_unitmisure", "ValE_unitmisure", "ValF_unitmisure",
    ],
}

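# Illustrative sketch (an assumption about how these lists are meant to line up, not
# code used elsewhere in this module): each ValX column pairs positionally with its
# ValX_unitmisure column, so a raw MySQL row can be walked as (value, unit) pairs.
#
#     for val_col, unit_col in zip(RAWDATACOR_COLUMNS["val_columns"],
#                                  RAWDATACOR_COLUMNS["unit_columns"]):
#         value, unit = row[val_col], row[unit_col]  # "row" is a hypothetical dict
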
ELABDATADISP_MEASUREMENT_FIELDS = {
    "shifts": ["XShift", "YShift", "ZShift", "HShift", "HShiftDir", "HShift_local"],
    "coordinates": ["X", "Y", "Z", "Xstar", "Zstar"],
    "kinematics": ["speed", "speed_local", "acceleration", "acceleration_local"],
    "sensors": ["T_node", "load_value", "water_level", "pressure"],
    "calculated": ["AlfaX", "AlfaY", "Area"],
}

ELABDATADISP_FIELD_MAPPING = {
    # shifts mapping (source -> (category, key))
    "XShift": ("shifts", "x"),
    "YShift": ("shifts", "y"),
    "ZShift": ("shifts", "z"),
    "HShift": ("shifts", "h"),
    "HShiftDir": ("shifts", "h_dir"),
    "HShift_local": ("shifts", "h_local"),
    # coordinates mapping
    "X": ("coordinates", "x"),
    "Y": ("coordinates", "y"),
    "Z": ("coordinates", "z"),
    "Xstar": ("coordinates", "x_star"),
    "Zstar": ("coordinates", "z_star"),
    # kinematics mapping
    "speed": ("kinematics", "speed"),
    "speed_local": ("kinematics", "speed_local"),
    "acceleration": ("kinematics", "acceleration"),
    "acceleration_local": ("kinematics", "acceleration_local"),
    # sensors mapping
    "T_node": ("sensors", "t_node"),
    "load_value": ("sensors", "load_value"),
    "water_level": ("sensors", "water_level"),
    "pressure": ("sensors", "pressure"),
    # calculated mapping
    "AlfaX": ("calculated", "alfa_x"),
    "AlfaY": ("calculated", "alfa_y"),
    "Area": ("calculated", "area"),
}

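# Illustrative sketch (not part of the migration code itself): the (category, key)
# tuples above are enough to fold a flat ELABDATADISP row into nested per-category
# dicts. The helper name and the "row" dict below are hypothetical, shown only to
# clarify the intended shape.
#
#     def _nest_measurements(row: dict) -> dict:
#         nested: dict = {}
#         for source_col, (category, key) in ELABDATADISP_FIELD_MAPPING.items():
#             if source_col in row:
#                 nested.setdefault(category, {})[key] = row[source_col]
#         return nested
#
#     # _nest_measurements({"XShift": 1.2, "T_node": 21.5})
#     # -> {"shifts": {"x": 1.2}, "sensors": {"t_node": 21.5}}
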
# PostgreSQL Partition years (from both tables)
PARTITION_YEARS = list(range(2014, 2032))  # 2014-2031

# Consolidation key definition (same for both tables)
# Multiple MySQL rows with same key but different NodeNum → 1 PostgreSQL row
# MySQL source fields
CONSOLIDATION_KEY_FIELDS = ["UnitName", "ToolNameID", "EventDate", "EventTime"]
# Keys for tracking in migration_state.last_key (NOT actual PostgreSQL target columns)
# Note: In PostgreSQL target, EventDate+EventTime become event_timestamp
CONSOLIDATION_KEY_PG_FIELDS = ["unit_name", "tool_name_id", "event_date", "event_time"]

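# Illustrative sketch (an assumption about how the consolidation key is applied, not
# code defined in this module): rows sharing the same key tuple collapse into one
# target row, so a natural grouping key for a sorted MySQL batch is the tuple of
# CONSOLIDATION_KEY_FIELDS. "sorted_rows" and "merge_nodes" are hypothetical.
#
#     from itertools import groupby
#
#     def _consolidation_key(row: dict) -> tuple:
#         return tuple(row[field] for field in CONSOLIDATION_KEY_FIELDS)
#
#     # rows must already be sorted by the key for groupby to consolidate correctly
#     # for key, group in groupby(sorted_rows, key=_consolidation_key):
#     #     merged = merge_nodes(list(group))
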
# Table configurations - support both uppercase and lowercase keys
_rawdatacor_config = {
    "mysql_table": "RAWDATACOR",
    "postgres_table": "rawdatacor",
    "mysql_pk": "id",  # MySQL primary key
    "postgres_pk": "id",  # PostgreSQL auto-increment primary key
    "mysql_max_id_field": "id",  # Field to track max ID from MySQL
    "consolidation_key": CONSOLIDATION_KEY_FIELDS,
    "consolidation_key_pg": CONSOLIDATION_KEY_PG_FIELDS,
}

_elabdatadisp_config = {
    "mysql_table": "ELABDATADISP",
    "postgres_table": "elabdatadisp",
    "mysql_pk": "idElabData",  # MySQL primary key
    "postgres_pk": "id",  # PostgreSQL auto-increment primary key
    "mysql_max_id_field": "idElabData",  # Field to track max ID from MySQL
    "consolidation_key": CONSOLIDATION_KEY_FIELDS,
    "consolidation_key_pg": CONSOLIDATION_KEY_PG_FIELDS,
}

TABLE_CONFIGS = {
    "rawdatacor": _rawdatacor_config,
    "RAWDATACOR": _rawdatacor_config,
    "elabdatadisp": _elabdatadisp_config,
    "ELABDATADISP": _elabdatadisp_config,
}

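# Illustrative lookup sketch (not code used elsewhere in this module): both case
# variants point at the same dict objects, so callers can index TABLE_CONFIGS with
# whichever spelling they have on hand.
#
#     config = TABLE_CONFIGS["RAWDATACOR"]  # same object as TABLE_CONFIGS["rawdatacor"]
#     source, target = config["mysql_table"], config["postgres_table"]
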
# Partition mapping utilities
def year_to_partition_name(year: int, table: str) -> str:
    """Map year to partition name.

    Partition naming scheme (different for each table):
    - RAWDATACOR: part0=2014, part1=2015, ..., part10=2024 (part{year-2014}),
      then d12=2025, ..., d17=2030 (d{year-2013})
    - ELABDATADISP: d0=2013, d1=2014, ..., d12=2025, ..., d18=2031 (d{year-2013})

    Out-of-range years are clamped to the nearest supported year.

    Args:
        year: Year (2013-2031, depending on table)
        table: Table name (RAWDATACOR or ELABDATADISP)

    Returns:
        Partition name (e.g., "part8" for RAWDATACOR/2022, "d13" for ELABDATADISP/2026)

    Raises:
        ValueError: If the table is unknown
    """
    table_upper = table.upper()

    if table_upper == "RAWDATACOR":
        # RAWDATACOR: 2014-2024 (part0-part10)
        # RAWDATACOR: 2025-2030 (d12-d17)
        if year < 2014:
            year = 2014
        elif year > 2030:
            year = 2030

        if year < 2025:
            suffix = "part"
            base_year = 2014
        else:
            suffix = "d"
            base_year = 2013  # Continue naming as d12, d13, ...

        partition_index = year - base_year  # 2014→0, ..., 2024→10; 2025→12, ..., 2030→17
        return f"{suffix}{partition_index}"

    elif table_upper == "ELABDATADISP":
        # ELABDATADISP: 2013-2031 (d0-d18)
        if year < 2013:
            year = 2013
        elif year > 2031:
            year = 2031

        partition_index = year - 2013  # 2013→0, 2014→1, ..., 2025→12, ..., 2031→18
        return f"d{partition_index}"

    else:
        raise ValueError(f"Unknown table: {table}")

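# Quick sanity check of the mapping above (values follow directly from the formulas;
# kept as comments only so nothing runs at import time):
#
#     year_to_partition_name(2014, "RAWDATACOR")    # -> "part0"   (2014 - 2014 = 0)
#     year_to_partition_name(2024, "RAWDATACOR")    # -> "part10"  (2024 - 2014 = 10)
#     year_to_partition_name(2025, "RAWDATACOR")    # -> "d12"     (2025 - 2013 = 12)
#     year_to_partition_name(2031, "RAWDATACOR")    # -> "d17"     (clamped to 2030 first)
#     year_to_partition_name(2026, "ELABDATADISP")  # -> "d13"     (2026 - 2013 = 13)
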
def get_partitions_from_year(year: int, table: str) -> list[str]:
    """Get list of partition names from a specific year onwards.

    Args:
        year: Starting year
        table: Table name (RAWDATACOR or ELABDATADISP)

    Returns:
        List of partition names from that year to the latest available year

    Example:
        get_partitions_from_year(2022, "RAWDATACOR")
        → ["part8", "part9", "part10", "d12", "d13", "d14", "d15", "d16", "d17"]
          # 2022→part8, ..., 2024→part10, 2025→d12, ..., 2030→d17

        get_partitions_from_year(2025, "ELABDATADISP")
        → ["d12", "d13", "d14", "d15", "d16", "d17"]  # 2025-2030
    """
    table_upper = table.upper()
    partitions = []

    if table_upper == "RAWDATACOR":
        end_year = 2030  # RAWDATACOR: part0-part10 (2014-2024) + d12-d17 (2025-2030)
    elif table_upper == "ELABDATADISP":
        end_year = 2030  # ELABDATADISP: d0-d17 (2013-2030)
    else:
        raise ValueError(f"Unknown table: {table}")

    # Generate partitions for each year from the starting year to end_year
    for y in range(year, end_year + 1):
        partition_name = year_to_partition_name(y, table)
        # Avoid duplicates (can happen if multiple years map to the same partition)
        if not partitions or partitions[-1] != partition_name:
            partitions.append(partition_name)

    return partitions

def date_string_to_partition_name(date_str: str, table: str) -> str:
    """Extract year from date string and map to partition name.

    Args:
        date_str: Date string in format 'YYYY-MM-DD' (e.g., '2022-05-15')
        table: Table name (RAWDATACOR or ELABDATADISP)

    Returns:
        Partition name (e.g., "part8" for RAWDATACOR or "d9" for ELABDATADISP
        given a 2022 date)

    Example:
        date_string_to_partition_name("2022-05-15", "RAWDATACOR") → "part8"
    """
    if not date_str or len(date_str) < 4:
        # Default to 2014 if invalid date
        return year_to_partition_name(2014, table)

    try:
        year = int(date_str[:4])
        return year_to_partition_name(year, table)
    except (ValueError, TypeError):
        # Default to 2014 if can't parse
        return year_to_partition_name(2014, table)
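

# End-to-end usage sketch (illustrative only; the table/date values and the idea of
# restricting a query to specific partitions are assumptions, not behavior defined here):
#
#     partition = date_string_to_partition_name("2026-03-01", "RAWDATACOR")  # -> "d13"
#     recent = get_partitions_from_year(2024, "RAWDATACOR")
#     # -> ["part10", "d12", "d13", "d14", "d15", "d16", "d17"]
#     # e.g. scan only the rawdatacor partitions in `recent` instead of all years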