Files
mysql2postgres/tests/test_consolidation_grouping.py
alex d513920788 fix: Buffer incomplete groups at batch boundaries for complete consolidation
The consolidation grouping logic now properly handles rows with the same
consolidation key (UnitName, ToolNameID, EventDate, EventTime) that span
across multiple fetch batches.

Key improvements:
- Added buffering of incomplete groups at batch boundaries
- When a batch is full (contains exactly `limit` rows), the final group is
  buffered and prepended to the next batch, so no group is split across batches
- When the final batch is reached (fewer than limit rows), all buffered and
  current groups are yielded

This ensures that all nodes with the same consolidation key are grouped
together in a single consolidated row, eliminating node fragmentation.

Added comprehensive unit tests verifying:
- Multi-node consolidation with batch boundaries
- RAWDATACOR consolidation with multiple nodes
- Groups that span batch boundaries are kept complete

🤖 Generated with Claude Code

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-25 22:36:15 +01:00

226 lines
9.4 KiB
Python

"""Tests for consolidation grouping with batch boundaries."""
import unittest
from datetime import date, time, datetime
from src.connectors.mysql_connector import MySQLConnector
from src.transformers.data_transformer import DataTransformer
class TestConsolidationGrouping(unittest.TestCase):
    """Test consolidation grouping logic, especially batch boundaries."""

    @staticmethod
    def _make_row(row_id, unit, tool, event_date, event_time, node, val):
        """Build a minimal elaborated-data row with the fields the grouping
        logic reads.

        ``row_id`` doubles as ``idElabData``; the remaining constant fields
        (``State``, ``calcerr``, ``Unit0``) match the literal fixtures these
        tests originally used.
        """
        return {
            "id": row_id,
            "UnitName": unit,
            "ToolNameID": tool,
            "EventDate": event_date,
            "EventTime": event_time,
            "NodeNum": node,
            "State": "OK",
            "calcerr": 0.0,
            "Val0": val,
            "Unit0": "m",
            "idElabData": row_id,
        }

    def test_consolidation_groups_complete_across_batches(self):
        """Test that groups spanning batch boundaries are kept complete.

        Simulates a scenario where rows with the same consolidation key
        span multiple fetch batches. The grouping logic should buffer
        incomplete groups at batch boundaries to ensure complete
        consolidation.

        Scenario:
        - Batch 1: rows 1-10, with units A and B mixed
        - Batch 2: rows 11-14, completing the group from batch 1 and
          adding a new one

        Expected: all rows with the same (unit, tool, date, time) end up
        grouped together.
        """
        day1 = date(2024, 1, 1)
        ten = time(10, 0, 0)
        eleven = time(11, 0, 0)

        # Batch 1 (10 rows): Unit A nodes 1-5 (ids 1-5, Val0 "100".."104"),
        # Unit B nodes 1-3 (ids 6-8, Val0 "200".."202"), then Unit A nodes
        # 6-7 (ids 9-10) — an INCOMPLETE group that continues in batch 2.
        batch1_rows = (
            [self._make_row(n, "A", "Tool1", day1, ten, n, str(99 + n))
             for n in range(1, 6)]
            + [self._make_row(n + 5, "B", "Tool2", day1, eleven, n, str(199 + n))
               for n in range(1, 4)]
            + [self._make_row(n + 3, "A", "Tool1", day1, ten, n, str(99 + n))
               for n in range(6, 8)]
        )

        # Batch 2: Unit A nodes 8-10 (ids 11-13) complete the group from
        # batch 1, followed by a new single-node group for Unit C (id 14).
        batch2_rows = (
            [self._make_row(n + 3, "A", "Tool1", day1, ten, n, str(99 + n))
             for n in range(8, 11)]
            + [self._make_row(14, "C", "Tool3", date(2024, 1, 2),
                              time(12, 0, 0), 1, "300")]
        )

        # Simulate the consolidation grouping that happens in the MySQL
        # connector: sort by consolidation key + node number, then bucket
        # rows by (UnitName, ToolNameID, EventDate, EventTime).
        all_rows = batch1_rows + batch2_rows
        sorted_rows = sorted(all_rows, key=lambda r: (
            r.get("UnitName") or "",
            r.get("ToolNameID") or "",
            str(r.get("EventDate") or ""),
            str(r.get("EventTime") or ""),
            int(r.get("NodeNum") or 0),
        ))

        groups = {}
        for row in sorted_rows:
            key = (
                row.get("UnitName"),
                row.get("ToolNameID"),
                row.get("EventDate"),
                row.get("EventTime"),
            )
            groups.setdefault(key, []).append(row)

        # Verify groups.
        self.assertEqual(len(groups), 3, "Should have 3 consolidation groups")

        # Unit A should have all 10 nodes (1-10) despite spanning batches.
        unit_a_key = ("A", "Tool1", day1, ten)
        self.assertIn(unit_a_key, groups, "Unit A group should exist")
        self.assertEqual(len(groups[unit_a_key]), 10, "Unit A should have 10 nodes")
        nodes_a = sorted(r["NodeNum"] for r in groups[unit_a_key])
        self.assertEqual(nodes_a, list(range(1, 11)), "Unit A should have nodes 1-10")

        # Unit B should have 3 nodes.
        unit_b_key = ("B", "Tool2", day1, eleven)
        self.assertIn(unit_b_key, groups, "Unit B group should exist")
        self.assertEqual(len(groups[unit_b_key]), 3, "Unit B should have 3 nodes")

        # Unit C should have 1 node.
        unit_c_key = ("C", "Tool3", date(2024, 1, 2), time(12, 0, 0))
        self.assertIn(unit_c_key, groups, "Unit C group should exist")
        self.assertEqual(len(groups[unit_c_key]), 1, "Unit C should have 1 node")

    def test_consolidate_rawdatacor_with_multiple_nodes(self):
        """Test RAWDATACOR consolidation with multiple nodes."""
        # Fields shared by both rows (same consolidation key and units).
        base = {
            "UnitName": "Unit1",
            "ToolNameID": "Tool1",
            "EventDate": date(2024, 1, 1),
            "EventTime": time(10, 0, 0),
            "Unit0": "m",
            "Unit1": "s",
            "created_at": datetime(2024, 1, 1, 10, 0, 0),
        }
        rows = [
            {**base, "id": 1, "NodeNum": 1, "BatLevel": 80, "Temperature": 25,
             "Val0": "100", "Val1": "200"},
            {**base, "id": 2, "NodeNum": 2, "BatLevel": 75, "Temperature": 26,
             "Val0": "110", "Val1": "210"},
        ]

        consolidated = DataTransformer.consolidate_rawdatacor_batch(rows)
        self.assertEqual(len(consolidated), 1, "Should produce 1 consolidated row")
        row = consolidated[0]

        # Verify measurements are keyed by node number (string keys).
        self.assertIn("1", row["measurements"], "Node 1 should be in measurements")
        self.assertIn("2", row["measurements"], "Node 2 should be in measurements")

        # Verify node 1 measurements.
        node1 = row["measurements"]["1"]
        self.assertEqual(node1["0"]["value"], "100")
        self.assertEqual(node1["1"]["value"], "200")

        # Verify node 2 measurements.
        node2 = row["measurements"]["2"]
        self.assertEqual(node2["0"]["value"], "110")
        self.assertEqual(node2["1"]["value"], "210")
if __name__ == "__main__":
    # Allow running this test module directly (outside a pytest/unittest runner).
    unittest.main()