From fccc83eb74192380852a691c70aae422233d2def Mon Sep 17 00:00:00 2001
From: alex
Date: Wed, 10 Dec 2025 19:58:20 +0100
Subject: [PATCH] docs: Add comprehensive documentation and helper scripts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add:
- QUICKSTART.md: 5-minute quick start guide with examples
- scripts/incus_setup.sh: Automated PostgreSQL container setup
- scripts/validate_migration.sql: SQL validation queries
- scripts/setup_cron.sh: Cron job setup for incremental migrations
- tests/test_setup.py: Unit tests for configuration and transformation
- install.sh: Quick installation script

Documentation includes:
- Step-by-step setup instructions
- Example queries for RAWDATACOR and ELABDATADISP
- Troubleshooting guide
- Performance optimization tips

🤖 Generated with Claude Code

Co-Authored-By: Claude Haiku 4.5
---
 QUICKSTART.md                  | 275 +++++++++++++++++++++++++++++++++
 install.sh                     |  55 ++++++
 scripts/incus_setup.sh         |  52 ++++++
 scripts/setup_cron.sh          |  37 +++++
 scripts/validate_migration.sql |  90 ++++++++++
 tests/test_setup.py            | 153 ++++++++++++++++
 6 files changed, 662 insertions(+)
 create mode 100644 QUICKSTART.md
 create mode 100755 install.sh
 create mode 100755 scripts/incus_setup.sh
 create mode 100755 scripts/setup_cron.sh
 create mode 100644 scripts/validate_migration.sql
 create mode 100644 tests/test_setup.py

diff --git a/QUICKSTART.md b/QUICKSTART.md
new file mode 100644
index 0000000..54511bd
--- /dev/null
+++ b/QUICKSTART.md
@@ -0,0 +1,275 @@
+# Quick Start Guide
+
+A quick guide to getting started with the migration tool.
+
+## 5-Minute Setup
+
+### 1. Clone and configure
+
+```bash
+# Enter the project directory
+cd mysql2postgres
+
+# Create a virtual environment
+python -m venv venv
+source venv/bin/activate
+
+# Install dependencies
+pip install -e .
+```
+
+### 2. Configure .env
+
+```bash
+# Copy the template
+cp .env.example .env
+
+# Edit it with your credentials
+nano .env
+```
+
+**Example .env:**
+```env
+MYSQL_HOST=localhost
+MYSQL_PORT=3306
+MYSQL_USER=root
+MYSQL_PASSWORD=mypassword
+MYSQL_DATABASE=production_db
+
+POSTGRES_HOST=localhost
+POSTGRES_PORT=5432
+POSTGRES_USER=postgres
+POSTGRES_PASSWORD=pgpassword
+POSTGRES_DATABASE=migrated_db
+
+BATCH_SIZE=10000
+LOG_LEVEL=INFO
+```
+
+### 3. Create PostgreSQL in Incus
+
+```bash
+# Create the Incus container
+incus launch images:ubuntu/22.04 pg-server
+
+# Open a shell in the container
+incus shell pg-server
+
+# Inside the container:
+apt update && apt install -y postgresql postgresql-contrib
+
+# Start PostgreSQL
+systemctl start postgresql
+systemctl enable postgresql
+
+# Leave the container
+exit
+
+# Get the container IP
+incus list
+# Note the IP and use it as POSTGRES_HOST in .env
+```
+
+### 4. Run the migration
+
+```bash
+# Check the configuration
+python main.py info
+
+# Create the schema
+python main.py setup --create-schema
+
+# Migrate all data
+python main.py migrate full
+
+# Check the result
+python main.py migrate incremental  # Should report "No new rows"
+```
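+
+To double-check the result, you can also compare row counts directly with the standard `mysql` and `psql` clients. A minimal sketch, assuming the values from your `.env` are exported as shell variables:
+
+```bash
+# Row count on the MySQL side
+mysql -h "$MYSQL_HOST" -u "$MYSQL_USER" -p"$MYSQL_PASSWORD" "$MYSQL_DATABASE" \
+      -N -e "SELECT COUNT(*) FROM RAWDATACOR;"
+
+# Row count on the PostgreSQL side (should match)
+PGPASSWORD="$POSTGRES_PASSWORD" psql -h "$POSTGRES_HOST" -U "$POSTGRES_USER" \
+      -d "$POSTGRES_DATABASE" -t -c "SELECT COUNT(*) FROM rawdatacor;"
+```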
+
+### 5. Run the benchmark
+
+```bash
+python main.py benchmark
+```
+
+## Common Commands
+
+### Initial migration
+```bash
+# Dry run (see what would be done)
+python main.py migrate full --dry-run
+
+# Actual migration
+python main.py migrate full
+
+# A single table only
+python main.py migrate full --table RAWDATACOR
+```
+
+### Periodic migrations
+```bash
+# Migrate only the changes since the last sync
+python main.py migrate incremental
+
+# With a custom state file
+python main.py migrate incremental --state-file daily_sync.json
+```
+
+### Benchmark
+```bash
+# Default benchmark (5 iterations)
+python main.py benchmark
+
+# With more iterations
+python main.py benchmark --iterations 20
+
+# With a custom output file
+python main.py benchmark --output my_results.json
+```
+
+## Example Queries on PostgreSQL
+
+After the migration you can query the data in PostgreSQL:
+
+### RAWDATACOR
+
+```sql
+-- All data for one unit
+SELECT * FROM rawdatacor
+WHERE unit_name = 'Unit1'
+LIMIT 10;
+
+-- Filter by the value of a measurement
+SELECT id, event_date, event_time,
+       measurements->'0'->>'value' as val0,
+       measurements->'0'->>'unit' as val0_unit
+FROM rawdatacor
+WHERE measurements ? '0'
+AND (measurements->'0'->>'value')::NUMERIC > 10.0;
+
+-- Aggregate by date
+SELECT event_date, COUNT(*) as record_count
+FROM rawdatacor
+WHERE event_date >= '2024-01-01'
+GROUP BY event_date
+ORDER BY event_date;
+
+-- Statistics per unit and tool
+SELECT unit_name, tool_name_id, COUNT(*) as total_records
+FROM rawdatacor
+GROUP BY unit_name, tool_name_id
+ORDER BY total_records DESC;
+```
+
+### ELABDATADISP
+
+```sql
+-- Rows with speed data
+SELECT id_elab_data, event_date, event_time,
+       (measurements->'kinematics'->>'speed')::NUMERIC as speed,
+       (measurements->'kinematics'->>'acceleration')::NUMERIC as acceleration
+FROM elabdatadisp
+WHERE measurements @> '{"kinematics": {}}'
+LIMIT 10;
+
+-- Filter on a value range
+SELECT unit_name, COUNT(*) as count
+FROM elabdatadisp
+WHERE (measurements->'kinematics'->>'speed')::NUMERIC > 5.0
+GROUP BY unit_name;
+
+-- Average speed per unit
+SELECT unit_name,
+       AVG((measurements->'kinematics'->>'speed')::NUMERIC) as avg_speed,
+       MAX((measurements->'kinematics'->>'speed')::NUMERIC) as max_speed
+FROM elabdatadisp
+WHERE event_date >= '2024-01-01'
+GROUP BY unit_name;
+
+-- Rows with a calculation error
+SELECT * FROM elabdatadisp
+WHERE calc_err > 0
+AND event_date >= '2024-01-01'
+ORDER BY event_date DESC;
+```
+
+## Monitoring Progress
+
+The tool shows a progress bar during the migration:
+
+```
+Migrating RAWDATACOR ████████████████░░░░░░░░░░░░░░░░░░░░ 45% 00:05:23
+```
+
+Logs are written to:
+- Console: default output
+- File: `.log` (configurable)
+
+## Troubleshooting
+
+### "Cannot connect to MySQL"
+```bash
+# Check that MySQL is reachable
+mysql -h localhost -u root -p -e "SELECT 1"
+```
+
+### "Table does not exist in PostgreSQL"
+```bash
+# Recreate the schema
+python main.py setup --create-schema
+```
+
+### "Migration is slow"
+```bash
+# Increase the batch size in .env
+BATCH_SIZE=50000
+
+# Also check that neither database server is CPU- or I/O-bound
+```
+
+### "Benchmark queries fail"
+```sql
+-- Check that the tables have actually been migrated
+SELECT COUNT(*) FROM rawdatacor;
+
+-- Check that the JSONB is valid
+SELECT measurements FROM rawdatacor LIMIT 1;
+```
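+
+### "JSONB queries are slow"
+
+If filters such as `measurements ? '0'` or `measurements @> '{"kinematics": {}}'` stay slow, a GIN index on the `measurements` column usually helps. This is a generic sketch; the schema created by `python main.py setup --create-schema` may already define equivalent indexes, so check the index list (query 6 in `scripts/validate_migration.sql`) first:
+
+```sql
+-- Generic JSONB indexes (skip any that already exist)
+CREATE INDEX IF NOT EXISTS rawdatacor_measurements_gin
+    ON rawdatacor USING GIN (measurements);
+CREATE INDEX IF NOT EXISTS elabdatadisp_measurements_gin
+    ON elabdatadisp USING GIN (measurements);
+
+-- Refresh planner statistics afterwards
+ANALYZE rawdatacor;
+ANALYZE elabdatadisp;
+```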
+
+## Next Steps
+
+1. **Validate the data**
+   ```bash
+   # Count rows in both databases
+   # MySQL
+   mysql> SELECT COUNT(*) FROM RAWDATACOR;
+
+   # PostgreSQL
+   psql> SELECT COUNT(*) FROM rawdatacor;
+   ```
+
+2. **Test critical queries**
+   - Make sure your application's queries work on PostgreSQL
+
+3. **Benchmark performance**
+   ```bash
+   python main.py benchmark --iterations 20
+   ```
+
+4. **Set up periodic migrations**
+   - Schedule `python main.py migrate incremental` with cron or a systemd timer (see the example after this list)
+
+5. **Index maintenance**
+   ```sql
+   -- Analyze the tables
+   ANALYZE rawdatacor;
+   ANALYZE elabdatadisp;
+
+   -- Reindex if needed
+   REINDEX TABLE rawdatacor;
+   ```
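+
+The cron schedule referenced in step 4 can be installed interactively with `scripts/setup_cron.sh`. The entry it adds is equivalent to the sketch below; the project path and log file name are placeholders to adapt:
+
+```
+# Run the incremental migration every 6 hours
+0 */6 * * * cd /path/to/mysql2postgres && ./venv/bin/python main.py migrate incremental >> migration.log 2>&1
+```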
+
+## Support
+
+For questions or problems, see the full README.md.
diff --git a/install.sh b/install.sh
new file mode 100755
index 0000000..0ed53b9
--- /dev/null
+++ b/install.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# Quick installation script

+set -e
+
+echo "MySQL to PostgreSQL Migration Tool - Installation"
+echo "=================================================="
+echo ""
+
+# Check Python version
+if ! command -v python3 >/dev/null 2>&1; then
+    echo "✗ python3 not found, please install Python 3 first"
+    exit 1
+fi
+PYTHON_VERSION=$(python3 --version 2>&1 | awk '{print $2}')
+echo "✓ Python $PYTHON_VERSION detected"
+
+# Create virtual environment
+echo ""
+echo "Creating virtual environment..."
+python3 -m venv venv
+source venv/bin/activate
+echo "✓ Virtual environment created"
+
+# Upgrade pip
+echo ""
+echo "Upgrading pip..."
+pip install --upgrade pip setuptools wheel > /dev/null 2>&1
+echo "✓ pip upgraded"
+
+# Install dependencies
+echo ""
+echo "Installing dependencies..."
+pip install -e . > /dev/null 2>&1
+echo "✓ Dependencies installed"
+
+# Copy .env.example to .env if it does not exist yet
+if [ ! -f .env ]; then
+    echo ""
+    echo "Creating .env file from template..."
+    cp .env.example .env
+    echo "✓ .env created (edit with your database credentials)"
+else
+    echo ""
+    echo "ℹ .env already exists"
+fi
+
+echo ""
+echo "=================================================="
+echo "Installation complete!"
+echo ""
+echo "Next steps:"
+echo "1. Edit .env with your database credentials"
+echo "2. Activate virtual environment: source venv/bin/activate"
+echo "3. Verify setup: python main.py info"
+echo "4. Create schema: python main.py setup --create-schema"
+echo "5. Run migration: python main.py migrate full"
+echo ""
+echo "For more help, see README.md or QUICKSTART.md"
diff --git a/scripts/incus_setup.sh b/scripts/incus_setup.sh
new file mode 100755
index 0000000..3bcbf18
--- /dev/null
+++ b/scripts/incus_setup.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+# Set up PostgreSQL in an Incus container
+
+set -e
+
+CONTAINER_NAME=${1:-pg-server}
+POSTGRES_PASSWORD=${2:-postgres}
+
+echo "Creating Incus container: $CONTAINER_NAME"
+
+# Create the container
+incus launch images:ubuntu/22.04 "$CONTAINER_NAME" --wait
+
+echo "Installing PostgreSQL..."
+
+# Install PostgreSQL
+incus exec "$CONTAINER_NAME" -- apt update
+incus exec "$CONTAINER_NAME" -- apt install -y postgresql postgresql-contrib
+
+echo "Starting PostgreSQL..."
+
+# Start PostgreSQL
+incus exec "$CONTAINER_NAME" -- systemctl start postgresql
+incus exec "$CONTAINER_NAME" -- systemctl enable postgresql
+
+# Set the postgres password
+incus exec "$CONTAINER_NAME" -- sudo -u postgres psql -c "ALTER USER postgres WITH PASSWORD '$POSTGRES_PASSWORD';"
+
+# Allow TCP connections
+incus exec "$CONTAINER_NAME" -- bash -c "
+echo \"host all all 0.0.0.0/0 md5\" >> /etc/postgresql/14/main/pg_hba.conf
+sed -i \"s/#listen_addresses = 'localhost'/listen_addresses = '*'/\" /etc/postgresql/14/main/postgresql.conf
+"
+
+# Restart PostgreSQL
+incus exec "$CONTAINER_NAME" -- systemctl restart postgresql
+
+# Get the container IPv4 address (CSV output avoids the table borders)
+IP=$(incus list "$CONTAINER_NAME" -c 4 --format csv | awk '{print $1}')
+
+echo ""
+echo "✓ PostgreSQL is running!"
+echo ""
+echo "Connection details:"
+echo "  Host: $IP"
+echo "  Port: 5432"
+echo "  User: postgres"
+echo "  Password: $POSTGRES_PASSWORD"
+echo ""
+echo "Update .env file with:"
+echo "  POSTGRES_HOST=$IP"
+echo "  POSTGRES_PASSWORD=$POSTGRES_PASSWORD"
diff --git a/scripts/setup_cron.sh b/scripts/setup_cron.sh
new file mode 100755
index 0000000..febacec
--- /dev/null
+++ b/scripts/setup_cron.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+# Set up a cron job for the incremental migration
+
+PROJECT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)
+VENV_PYTHON="$PROJECT_DIR/venv/bin/python"
+LOG_FILE="$PROJECT_DIR/migration_$(date +%Y%m%d).log"
+
+# Create cron job entry
+CRON_ENTRY="0 */6 * * * cd $PROJECT_DIR && $VENV_PYTHON main.py migrate incremental >> $LOG_FILE 2>&1"
+
+echo "Cron job to be added:"
+echo "$CRON_ENTRY"
+echo ""
+echo "This will run the incremental migration every 6 hours."
+echo ""
+
+# Check whether it already exists
+if crontab -l 2>/dev/null | grep -q "migrate incremental"; then
+    echo "⚠ Cron job already exists"
+    echo ""
+    echo "Current cron jobs:"
+    crontab -l | grep -v '^#' | grep -v '^$'
+else
+    echo "Add to crontab? (y/n)"
+    read -r response
+
+    if [ "$response" = "y" ]; then
+        # Add cron job
+        (crontab -l 2>/dev/null; echo "$CRON_ENTRY") | crontab -
+        echo "✓ Cron job added successfully"
+        echo ""
+        echo "Verify with: crontab -l"
+        echo "View logs: tail -f migration_*.log"
+    else
+        echo "Cron job not added"
+    fi
+fi
diff --git a/scripts/validate_migration.sql b/scripts/validate_migration.sql
new file mode 100644
index 0000000..6e042a5
--- /dev/null
+++ b/scripts/validate_migration.sql
@@ -0,0 +1,90 @@
+-- Validation queries for PostgreSQL after migration
+
+-- 1. Verify row counts match between MySQL and PostgreSQL
+-- Run this on both databases and compare
+
+-- PostgreSQL queries:
+SELECT 'rawdatacor' as table_name, COUNT(*) as row_count FROM rawdatacor
+UNION ALL
+SELECT 'elabdatadisp' as table_name, COUNT(*) as row_count FROM elabdatadisp;
+
+-- 2. Check for NULL values in JSONB (both counts should be 0)
+SELECT 'rawdatacor with NULL measurements' as check_name, COUNT(*) as count
+FROM rawdatacor WHERE measurements IS NULL
+UNION ALL
+SELECT 'elabdatadisp with NULL measurements' as check_name, COUNT(*) as count
+FROM elabdatadisp WHERE measurements IS NULL;
+
+-- 3. Verify date range coverage
+SELECT
+    'rawdatacor dates' as table_name,
+    MIN(event_date) as min_date,
+    MAX(event_date) as max_date,
+    COUNT(DISTINCT event_date) as distinct_dates
+FROM rawdatacor
+UNION ALL
+SELECT
+    'elabdatadisp dates' as table_name,
+    MIN(event_date) as min_date,
+    MAX(event_date) as max_date,
+    COUNT(DISTINCT event_date) as distinct_dates
+FROM elabdatadisp;
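+
+-- Optional: rows whose measurements ended up as an empty JSONB object. An empty object can be
+-- legitimate when every source value column was NULL, so treat a non-zero count as something to
+-- inspect rather than as an error.
+SELECT 'rawdatacor empty measurements' as check_name, COUNT(*) as count
+FROM rawdatacor WHERE measurements = '{}'::jsonb
+UNION ALL
+SELECT 'elabdatadisp empty measurements' as check_name, COUNT(*) as count
+FROM elabdatadisp WHERE measurements = '{}'::jsonb;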
+
+-- 4. Verify partitions are in use
+EXPLAIN (ANALYZE, BUFFERS)
+SELECT COUNT(*) FROM rawdatacor WHERE event_date >= '2024-01-01' AND event_date < '2024-12-31';
+
+-- 5. Check JSONB structure samples
+-- RAWDATACOR
+SELECT measurements FROM rawdatacor WHERE measurements IS NOT NULL LIMIT 1;
+
+-- ELABDATADISP
+SELECT measurements FROM elabdatadisp WHERE measurements IS NOT NULL LIMIT 1;
+
+-- 6. Verify indexes exist
+SELECT schemaname, tablename, indexname
+FROM pg_indexes
+WHERE tablename IN ('rawdatacor', 'elabdatadisp')
+ORDER BY tablename, indexname;
+
+-- 7. Performance: Simple queries
+\timing on
+
+-- Single row by primary key
+SELECT * FROM rawdatacor WHERE id = 1000 AND event_date = '2024-01-01';
+
+-- Date range scan
+SELECT COUNT(*) FROM rawdatacor WHERE event_date >= '2024-01-01' AND event_date < '2024-12-31';
+
+-- Unit and tool filter
+SELECT COUNT(*) FROM rawdatacor WHERE unit_name = 'Unit1' AND tool_name_id = 'Tool1';
+
+-- JSONB filter
+SELECT COUNT(*) FROM rawdatacor WHERE measurements ? '0';
+
+\timing off
+
+-- 8. Identify partitions with data
+SELECT
+    schemaname,
+    tablename,
+    pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) as size
+FROM pg_tables
+WHERE tablename LIKE 'rawdatacor_%' OR tablename LIKE 'elabdatadisp_%'
+ORDER BY tablename;
+
+-- 9. Check for constraint violations
+-- Verify unique constraints
+SELECT
+    'rawdatacor duplicate unique key' as check_name,
+    COUNT(*) as duplicate_count
+FROM rawdatacor
+GROUP BY unit_name, tool_name_id, node_num, event_date, event_time
+HAVING COUNT(*) > 1
+UNION ALL
+SELECT
+    'elabdatadisp duplicate unique key' as check_name,
+    COUNT(*) as duplicate_count
+FROM elabdatadisp
+GROUP BY unit_name, tool_name_id, node_num, event_date, event_time
+HAVING COUNT(*) > 1;
diff --git a/tests/test_setup.py b/tests/test_setup.py
new file mode 100644
index 0000000..e70c5ba
--- /dev/null
+++ b/tests/test_setup.py
@@ -0,0 +1,153 @@
+"""Test setup and basic functionality."""
+import pytest
+from config import get_settings, TABLE_CONFIGS, RAWDATACOR_COLUMNS, ELABDATADISP_FIELD_MAPPING
+from src.transformers.data_transformer import DataTransformer
+
+
+class TestConfiguration:
+    """Test configuration loading."""
+
+    def test_settings_loaded(self):
+        """Test that settings can be loaded."""
+        settings = get_settings()
+        assert settings is not None
+        assert settings.mysql.host is not None
+        assert settings.postgres.host is not None
+
+    def test_table_configs_exist(self):
+        """Test that table configurations exist."""
+        assert "RAWDATACOR" in TABLE_CONFIGS or len(TABLE_CONFIGS) > 0
+
+    def test_migration_batch_size(self):
+        """Test that batch size is configured."""
+        settings = get_settings()
+        assert settings.migration.batch_size > 0
+        assert settings.migration.batch_size <= 1000000
+
+
+class TestDataTransformation:
+    """Test data transformation logic."""
+
+    def test_rawdatacor_transformation(self):
+        """Test RAWDATACOR row transformation."""
+        # Sample MySQL row
+        mysql_row = {
+            "id": 1,
+            "UnitName": "TestUnit",
+            "ToolNameID": "Tool1",
+            "NodeNum": 1,
+            "EventDate": "2024-01-01",
+            "EventTime": "12:00:00",
+            "BatLevel": 3.5,
+            "Temperature": 25.5,
+            "Val0": "100.5",
+            "Val1": None,
+            "Val2": "200.3",
+            "Val0_unitmisure": "°C",
+            "Val1_unitmisure": "bar",
+            "Val2_unitmisure": "m/s",
+        }
+
+        # Add remaining Val columns as None
+        for i in range(3, 16):
+            col = f"Val{i:X}"  # Val3-ValF
+            mysql_row[col] = None
+            mysql_row[f"{col}_unitmisure"] = None
+
+        # Transform
+        pg_row = DataTransformer.transform_rawdatacor_row(mysql_row)
+
+        # Verify
+        assert pg_row["id"] == 1
+        assert pg_row["unit_name"] == "TestUnit"
+        assert pg_row["tool_name_id"] == "Tool1"
+        assert isinstance(pg_row["measurements"], dict)
+        assert "0" in pg_row["measurements"]
+        assert pg_row["measurements"]["0"]["value"] == "100.5"
+        assert pg_row["measurements"]["0"]["unit"] == "°C"
+        assert "1" not in pg_row["measurements"]  # NULL values excluded
+        assert "2" in pg_row["measurements"]
+
+    def test_elabdatadisp_transformation(self):
+        """Test ELABDATADISP row transformation."""
+        # Sample MySQL row
+        mysql_row = {
+            "idElabData": 5000,
+            "UnitName": "TestUnit",
+            "ToolNameID": "Tool1",
+            "NodeNum": 1,
+            "EventDate": "2024-01-01",
+            "EventTime": "12:00:00",
+            "State": "OK",
+            "calcerr": 0,
+            "XShift": 1.234567,
+            "YShift": 2.345678,
+            "ZShift": 3.456789,
+            "HShift": 4.567890,
+            "HShiftDir": 5.678901,
+            "HShift_local": 6.789012,
+            "X": 10.123456,
+            "Y": 20.234567,
+            "Z": 30.345678,
+            "Xstar": 40.456789,
+            "Zstar": 50.567890,
+            "speed": 1.111111,
+            "speed_local": 2.222222,
+            "acceleration": 3.333333,
+            "acceleration_local": 4.444444,
+            "T_node": 25.5,
+            "load_value": 100.5,
+            "water_level": 50.5,
+            "pressure": 1.013,
+            "AlfaX": 0.123456,
+            "AlfaY": 0.234567,
+            "Area": 100.5,
+        }
+
+        # Transform
+        pg_row = DataTransformer.transform_elabdatadisp_row(mysql_row)
+
+        # Verify
+        assert pg_row["id_elab_data"] == 5000
+        assert pg_row["state"] == "OK"
+        assert isinstance(pg_row["measurements"], dict)
+        assert "shifts" in pg_row["measurements"]
+        assert "coordinates" in pg_row["measurements"]
+        assert "kinematics" in pg_row["measurements"]
+        assert pg_row["measurements"]["shifts"]["x"] == 1.234567
+        assert pg_row["measurements"]["coordinates"]["x"] == 10.123456
+        assert pg_row["measurements"]["kinematics"]["speed"] == 1.111111
+
+    def test_column_order_rawdatacor(self):
+        """Test column order for RAWDATACOR."""
+        columns = DataTransformer.get_column_order("rawdatacor")
+        assert isinstance(columns, list)
+        assert "id" in columns
+        assert "measurements" in columns
+        assert "unit_name" in columns
+
+    def test_column_order_elabdatadisp(self):
+        """Test column order for ELABDATADISP."""
+        columns = DataTransformer.get_column_order("elabdatadisp")
+        assert isinstance(columns, list)
+        assert "id_elab_data" in columns
+        assert "measurements" in columns
+        assert "state" in columns
+
+
+class TestFieldMapping:
+    """Test field mapping configuration."""
+
+    def test_all_rawdatacor_columns_mapped(self):
+        """Test that all RAWDATACOR value columns are defined."""
+        for val_col in RAWDATACOR_COLUMNS["val_columns"]:
+            assert val_col.startswith("Val")
+
+    def test_all_elabdatadisp_fields_mapped(self):
+        """Test that all ELABDATADISP fields are mapped."""
+        mapped_fields = set(ELABDATADISP_FIELD_MAPPING.keys())
+        assert len(mapped_fields) > 20  # Should have many fields
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])