fix: duplicate group insertion in consolidation generator
Critical bug: current_group and current_key were declared inside the while loop, so they were reset on every batch iteration. When an incomplete group spanned a batch boundary, it would be:

1. Buffered at the end of batch N (in the local current_group)
2. LOST when the loop continued (new local variables were created)
3. Re-fetched and yielded again in batch N+1

This caused the same consolidated record to be inserted many times.

Solution: move current_group and current_key OUTSIDE the while loop so they persist across batch iterations. Incomplete groups now merge correctly across batch boundaries without duplication.

Algorithm (see the sketch below):
- Only yield a group once we are 100% certain it is complete
- A group is complete when the next key differs from the current key
- At batch boundaries, incomplete groups stay buffered for the next batch
- Resume always uses last_completed_key to avoid re-processing

This fixes the user's observation of 27 identical rows for the same consolidated record.

🤖 Generated with Claude Code

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
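To make the shape of the fix concrete, here is a minimal sketch of the corrected control flow. It is not the connector's actual code: fetch_batch() and group_key() are hypothetical stand-ins for however the real generator reads row batches and extracts the consolidation key.

    def fetch_consolidation_groups(fetch_batch, group_key):
        """Yield only complete groups, merging groups that span batches."""
        # The fix: buffer state lives OUTSIDE the while loop, so a group
        # left incomplete at the end of batch N is still buffered when
        # batch N+1 arrives, instead of being recreated (and duplicated).
        current_key = None
        current_group = []
        last_completed_key = None  # the real connector persists this for resume

        while True:
            batch = fetch_batch()
            if not batch:
                break
            for row in batch:
                key = group_key(row)
                if current_group and key != current_key:
                    # The next key differs, so the buffered group is complete.
                    yield current_group
                    last_completed_key = current_key
                    current_group = []
                current_key = key
                current_group.append(row)
            # Batch boundary: a trailing incomplete group stays buffered.

        if current_group:
            # End of data: the final buffered group is complete by definition.
            yield current_group

    # Tiny demo: key "b" straddles the two batches but is yielded exactly once.
    batches = iter([[("a", 1), ("b", 1)], [("b", 2), ("c", 1)], []])
    groups = fetch_consolidation_groups(lambda: next(batches), lambda row: row[0])
    print(list(groups))  # [[('a', 1)], [('b', 1), ('b', 2)], [('c', 1)]]

With this structure a group whose rows straddle two batches is yielded exactly once, after its last row has been seen, which is what eliminates the duplicate inserts.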
test_generator_output.py (new file, 63 lines)
@@ -0,0 +1,63 @@
#!/usr/bin/env python3
"""Debug what the generator is actually returning."""
import sys
sys.path.insert(0, '/home/alex/devel/mysql2postgres')

from src.connectors.mysql_connector import MySQLConnector
from src.utils.logger import setup_logger, get_logger

setup_logger(__name__)
logger = get_logger(__name__)

print("\n" + "="*80)
print("Testing consolidation groups generator for d1")
print("="*80 + "\n")

with MySQLConnector() as mysql_conn:
    partition = "d1"
    group_num = 0
    # Use datetime objects to match what the generator uses
    import datetime
    target_key = ("ID0003", "DT0002", datetime.date(2014, 8, 31), datetime.timedelta(hours=11, minutes=59, seconds=10))

    print("First 20 groups from generator:\n")
    print("DEBUG: First row columns:", flush=True)

    for group_rows in mysql_conn.fetch_consolidation_groups_from_partition(
        "ELABDATADISP",
        partition,
        limit=100
    ):
        group_num += 1
        if group_rows:
            first_row = group_rows[0]

            # Debug: print all columns from first group
            if group_num == 1:
                print(f" Available columns: {first_row.keys()}\n")
                print(f" First row data: {dict(first_row)}\n")

            # Keep EventDate/EventTime as date/timedelta objects; wrapping
            # them in str() would make the comparison against target_key
            # (which holds datetime objects) always fail.
            key = (
                first_row.get("UnitName"),
                first_row.get("ToolNameID"),
                first_row.get("EventDate"),
                first_row.get("EventTime")
            )
            nodes = sorted([r.get('NodeNum') for r in group_rows])

            # Show first 20 groups or target key
            if group_num <= 20 or key == target_key:
                print(f"Group {group_num}: key={key}")
                print(f" Nodes ({len(nodes)}): {nodes}")
                print(f" Rows count: {len(group_rows)}\n")

                if key == target_key:
                    print("^^^ THIS IS THE TARGET KEY! ^^^\n")
                    break

        if group_num >= 100:
            print(f"\nStopped at group {group_num}")
            break

print(f"\nTotal groups processed: {group_num}")
print("Done!\n")