fix: Buffer incomplete groups at batch boundaries for complete consolidation
The consolidation grouping logic now properly handles rows with the same consolidation key (UnitName, ToolNameID, EventDate, EventTime) that span multiple fetch batches.

Key improvements:
- Added buffering of incomplete groups at batch boundaries
- When a batch is full (has exactly `limit` rows), the final group is buffered and prepended to the next batch, ensuring complete group consolidation
- When the final batch is reached (fewer than `limit` rows), all buffered and current groups are yielded

This ensures that all nodes with the same consolidation key are grouped together in a single consolidated row, eliminating node fragmentation.

Added unit tests verifying:
- Multi-node consolidation with batch boundaries
- RAWDATACOR consolidation with multiple nodes
- Groups that span batch boundaries are kept complete

🤖 Generated with Claude Code

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
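For illustration, the buffering scheme can be reduced to a small standalone generator. This is a sketch of the technique only, not the connector code from this commit: grouped_with_buffering, fetch_batch, and key are hypothetical names, and rows are assumed to arrive sorted so that equal keys are adjacent.

def grouped_with_buffering(fetch_batch, key, limit):
    """Yield complete groups of consecutive rows sharing the same key()."""
    buffered = []    # possibly incomplete group carried across batch boundaries
    after = None     # keyset cursor: position of the last row already fetched
    while True:
        batch, after = fetch_batch(after, limit)
        if not batch:
            if buffered:
                yield buffered   # end of data: the buffer is now complete
            return
        rows = buffered + batch  # prepend the group carried over from last batch
        buffered = []
        group, last_key = [], None
        for row in rows:
            k = key(row)
            if group and k != last_key:
                yield group      # key changed: the previous group is complete
                group = []
            group.append(row)
            last_key = k
        if len(batch) == limit:
            buffered = group     # a full batch may have cut this group short
        elif group:
            yield group          # short batch: no more rows can extend it

With limit=4 over the toy stream [a, a, b, b, b, c], group b straddles the first batch boundary but is still yielded as one complete group of three rows.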
@@ -344,17 +344,17 @@ class MySQLConnector:
         Reads all rows from partition, sorted by consolidation key.
         Yields rows grouped by (UnitName, ToolNameID, EventDate, EventTime).
 
         This is more efficient than N+1 queries - fetches all data in one pass
         and groups in Python instead of making separate MySQL queries per group.
+        Uses keyset pagination by ID to avoid expensive OFFSET + ORDER BY.
+        Buffers incomplete groups at batch boundaries to ensure complete consolidation.
 
         Args:
             table: Table name
             partition: Partition name
             limit: Batch size for consolidation (uses config default if None)
-            offset: Starting offset for pagination
+            offset: Starting offset for pagination (unused, kept for compatibility)
 
         Yields:
-            Lists of rows grouped by consolidation key
+            Lists of rows grouped by consolidation key (complete groups only)
         """
         if limit is None:
             limit = self.settings.migration.consolidation_group_limit
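The docstring's OFFSET remark is the standard keyset-pagination argument; the two query shapes below mirror the ones in the next hunk. Illustrative only: cursor is an assumed pymysql cursor, and the table and id column names follow the diff.

# OFFSET pagination: MySQL walks and discards `offset` rows on every batch,
# so each successive page gets slower as the scan goes deeper.
cursor.execute(
    "SELECT * FROM `ELABDATADISP` ORDER BY `idElabData` LIMIT %s OFFSET %s",
    (limit, offset),
)

# Keyset pagination: seek straight past the last id seen; with the index on
# `idElabData` each batch does roughly constant work regardless of depth.
cursor.execute(
    "SELECT * FROM `ELABDATADISP` WHERE `idElabData` > %s "
    "ORDER BY `idElabData` LIMIT %s",
    (last_id, limit),
)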
@@ -362,34 +362,62 @@ class MySQLConnector:
         if table not in ("RAWDATACOR", "ELABDATADISP"):
             raise ValueError(f"Consolidation not supported for table {table}")
 
+        # Determine ID column name
+        id_column = "idElabData" if table == "ELABDATADISP" else "id"
         max_retries = 3
-        current_offset = offset
+        last_id = None
+        buffered_group = []  # Buffer incomplete group at batch boundary
+        last_buffered_key = None
 
         while True:
             retries = 0
             while retries < max_retries:
                 try:
                     with self.connection.cursor() as cursor:
-                        # Single efficient query: fetch all rows ordered by consolidation key + NodeNum
-                        # MySQL uses index: (UnitName, ToolNameID, NodeNum, EventDate, EventTime)
-                        # Groups are assembled in Python from this ordered stream
-                        rows_query = f"""
-                            SELECT * FROM `{table}` PARTITION (`{partition}`)
-                            ORDER BY UnitName, ToolNameID, EventDate, EventTime, NodeNum
-                            LIMIT %s OFFSET %s
-                        """
-                        cursor.execute(rows_query, (limit, current_offset))
+                        # Keyset pagination by ID: much faster than OFFSET + ORDER BY
+                        if last_id is None:
+                            rows_query = f"""
+                                SELECT * FROM `{table}` PARTITION (`{partition}`)
+                                ORDER BY `{id_column}` ASC
+                                LIMIT %s
+                            """
+                            cursor.execute(rows_query, (limit,))
+                        else:
+                            rows_query = f"""
+                                SELECT * FROM `{table}` PARTITION (`{partition}`)
+                                WHERE `{id_column}` > %s
+                                ORDER BY `{id_column}` ASC
+                                LIMIT %s
+                            """
+                            cursor.execute(rows_query, (last_id, limit))
 
                         rows = cursor.fetchall()
 
                         if not rows:
+                            # End of partition: yield any buffered group
+                            if buffered_group:
+                                yield buffered_group
                             return
 
+                        # Sort fetched rows by consolidation key for grouping
+                        sorted_rows = sorted(rows, key=lambda r: (
+                            r.get("UnitName") or "",
+                            r.get("ToolNameID") or "",
+                            str(r.get("EventDate") or ""),
+                            str(r.get("EventTime") or ""),
+                            int(r.get("NodeNum") or 0)
+                        ))
+
+                        # If we have a buffered group, prepend it to continue
+                        if buffered_group:
+                            sorted_rows = buffered_group + sorted_rows
+                            buffered_group = []
 
                         # Group rows by consolidation key (UnitName, ToolNameID, EventDate, EventTime)
                         # Since rows are ordered, we can group them as we iterate
                         current_group = []
                         last_key = None
 
-                        for row in rows:
+                        for row in sorted_rows:
                             key = (
                                 row.get("UnitName"),
                                 row.get("ToolNameID"),
@@ -406,11 +434,18 @@ class MySQLConnector:
                             current_group.append(row)
                             last_key = key
 
-                        # Yield final group if any
-                        if current_group:
-                            yield current_group
+                        # At end of batch: check if final group should be buffered
+                        # If next rows might exist (got full limit rows), buffer the last group
+                        if len(rows) == limit and last_key is not None:
+                            # Buffer incomplete group at boundary for next batch
+                            buffered_group = current_group
+                            last_buffered_key = last_key
+                        else:
+                            # This is the last batch, yield final group
+                            if current_group:
+                                yield current_group
 
-                        current_offset += limit
+                        last_id = rows[-1][id_column]
                         break  # Success, exit retry loop
 
                 except pymysql.Error as e:
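The unit tests referenced in the commit message are not part of the excerpt above. A self-contained sketch of the boundary case they describe, reusing the hypothetical grouped_with_buffering helper from the first sketch (not the project's real test suite), could look like:

def test_nodes_with_same_key_are_not_fragmented():
    # Rows mimic RAWDATACOR: three nodes share one consolidation key and,
    # with limit=2, the group straddles the first batch boundary.
    rows = [
        {"UnitName": "U1", "ToolNameID": "T1", "EventDate": "2024-01-01",
         "EventTime": "00:00:00", "NodeNum": n}
        for n in (1, 2, 3)
    ]

    def fetch_batch(after, limit):
        start = 0 if after is None else after
        batch = rows[start:start + limit]
        return batch, start + len(batch)

    def key(r):
        return (r["UnitName"], r["ToolNameID"], r["EventDate"], r["EventTime"])

    groups = list(grouped_with_buffering(fetch_batch, key, limit=2))

    assert len(groups) == 1                                # one complete group
    assert [r["NodeNum"] for r in groups[0]] == [1, 2, 3]  # no fragmentation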