From 51d6fa568d43e44ef1ce878b16640ea90d088a16 Mon Sep 17 00:00:00 2001 From: vvzvlad Date: Thu, 14 Aug 2025 01:17:20 +0300 Subject: [PATCH] refactor main_rotation_cycle to streamline node processing and container management --- rotate/rotate.py | 146 +++++++++++++++++++++-------------------------- 1 file changed, 66 insertions(+), 80 deletions(-) diff --git a/rotate/rotate.py b/rotate/rotate.py index b8c9c7a..87278b8 100644 --- a/rotate/rotate.py +++ b/rotate/rotate.py @@ -281,97 +281,83 @@ def main_rotation_cycle(): logger.info("Starting nexus rotation cycle") - while True: - cycle_count += 1 - logger.info(f"=== Starting cycle #{cycle_count} ===") - try: - # Get next node - node = get_next_node(grist, logger) - if not node: - logger.error("No node available, waiting 60 seconds before retry") - time.sleep(60) - continue + # Get next node + node = get_next_node(grist, logger) + if not node: + logger.error("No node available, waiting 60 seconds before retry") + time.sleep(60) + return - node_id = node.NodeID - current_hours = int(node.Hours) + node_id = node.NodeID + current_hours = int(node.Hours) - # Update hours (+1 before starting container) - new_hours = current_hours + 1 - grist.update(node.id, {"Hours": new_hours}, "Nodes") - logger.info(f"Updated node {node_id} hours: {current_hours} -> {new_hours}") + # Update hours (+1 before starting container) + new_hours = current_hours + 1 + grist.update(node.id, {"Hours": new_hours}, "Nodes") + logger.info(f"Updated node {node_id} hours: {current_hours} -> {new_hours}") - # Remove any existing container with same name + # Remove any existing container with same name + stop_and_remove_container(container_name, logger) + + # Start new container + if not start_container(container_name, node_id, logger): + logger.error(f"Failed to start container for node {node_id}") + # Return the hour back since container didn't start + grist.update(node.id, {"Hours": current_hours}, "Nodes") + logger.info(f"Reverted node {node_id} hours back to: {current_hours}") + time.sleep(60) + return + + logger.info(f"Container started successfully for node {node_id}") + + # Wait 5 hours with progress updates and health checks every 10 minutes + wait_hours = 5 + total_minutes = wait_hours * 60 + interval_minutes = 10 + + logger.info(f"Waiting {wait_hours} hours ({total_minutes} minutes) for node {node_id}") + + container_failed = False + for elapsed_minutes in range(0, total_minutes, interval_minutes): + remaining_minutes = total_minutes - elapsed_minutes + remaining_hours = remaining_minutes // 60 + remaining_mins = remaining_minutes % 60 + + if elapsed_minutes > 0: # Skip first iteration log + logger.info(f"Node {node_id}: {remaining_hours}h {remaining_mins}m remaining") + + # Check container status before sleeping + if not check_container_status(container_name, logger): + logger.error(f"Container {container_name} is not running, attempting restart") stop_and_remove_container(container_name, logger) - - # Start new container + if not start_container(container_name, node_id, logger): - logger.error(f"Failed to start container for node {node_id}") - # Return the hour back since container didn't start - grist.update(node.id, {"Hours": current_hours}, "Nodes") - logger.info(f"Reverted node {node_id} hours back to: {current_hours}") - time.sleep(60) - continue + logger.error(f"Failed to restart container for node {node_id}") + container_failed = True + break + else: + logger.info(f"Container restarted successfully for node {node_id}") + + time.sleep(interval_minutes * 60) # Sleep 10 minutes + + # If container failed during the cycle, skip to next iteration + if container_failed: + logger.error(f"Container failed during cycle for node {node_id}, moving to next node") + return - logger.info(f"Container started successfully for node {node_id}") - - # Wait 5 hours with progress updates and health checks every 10 minutes - wait_hours = 5 - total_minutes = wait_hours * 60 - interval_minutes = 10 - - logger.info(f"Waiting {wait_hours} hours ({total_minutes} minutes) for node {node_id}") - - container_failed = False - for elapsed_minutes in range(0, total_minutes, interval_minutes): - remaining_minutes = total_minutes - elapsed_minutes - remaining_hours = remaining_minutes // 60 - remaining_mins = remaining_minutes % 60 - - if elapsed_minutes > 0: # Skip first iteration log - logger.info(f"Node {node_id}: {remaining_hours}h {remaining_mins}m remaining") - - # Check container status before sleeping - if not check_container_status(container_name, logger): - logger.error(f"Container {container_name} is not running, attempting restart") - stop_and_remove_container(container_name, logger) - - if not start_container(container_name, node_id, logger): - logger.error(f"Failed to restart container for node {node_id}") - container_failed = True - break - else: - logger.info(f"Container restarted successfully for node {node_id}") - - time.sleep(interval_minutes * 60) # Sleep 10 minutes - - # If container failed during the cycle, skip to next iteration - if container_failed: - logger.error(f"Container failed during cycle for node {node_id}, moving to next node") - continue + # Stop and remove container + logger.info(f"5 hours completed for node {node_id}, stopping container") + stop_and_remove_container(container_name, logger) - # Stop and remove container - logger.info(f"5 hours completed for node {node_id}, stopping container") - stop_and_remove_container(container_name, logger) + # Update hours (+4 after completion) + final_hours = new_hours + 4 + grist.update(node.id, {"Hours": final_hours}, "Nodes") + logger.info(f"Updated node {node_id} final hours: {new_hours} -> {final_hours}") - # Update hours (+4 after completion) - final_hours = new_hours + 4 - grist.update(node.id, {"Hours": final_hours}, "Nodes") - logger.info(f"Updated node {node_id} final hours: {new_hours} -> {final_hours}") + logger.info(f"=== Cycle #{cycle_count} completed for node {node_id} ===") - logger.info(f"=== Cycle #{cycle_count} completed for node {node_id} ===") - except KeyboardInterrupt: - logger.info("Received keyboard interrupt, stopping rotation") - stop_and_remove_container(container_name, logger) - break - except Exception as e: - logger.error(f"Cycle #{cycle_count} failed with error: {str(e)}") - stop_and_remove_container(container_name, logger) - logger.info("Waiting 60 seconds before next attempt") - time.sleep(60) - - logger.info("Nexus rotation cycle stopped") if __name__ == "__main__":