Refactor `main_rotation_cycle` to streamline node processing and container management

This commit is contained in:
vvzvlad
2025-08-14 01:17:20 +03:00
parent 72a9804c69
commit 51d6fa568d

View File

@ -281,97 +281,83 @@ def main_rotation_cycle():
logger.info("Starting nexus rotation cycle")
while True:
cycle_count += 1
logger.info(f"=== Starting cycle #{cycle_count} ===")
try:
# Get next node
node = get_next_node(grist, logger)
if not node:
logger.error("No node available, waiting 60 seconds before retry")
time.sleep(60)
continue
# Get next node
node = get_next_node(grist, logger)
if not node:
logger.error("No node available, waiting 60 seconds before retry")
time.sleep(60)
return
node_id = node.NodeID
current_hours = int(node.Hours)
node_id = node.NodeID
current_hours = int(node.Hours)
# Update hours (+1 before starting container)
new_hours = current_hours + 1
grist.update(node.id, {"Hours": new_hours}, "Nodes")
logger.info(f"Updated node {node_id} hours: {current_hours} -> {new_hours}")
# Update hours (+1 before starting container)
new_hours = current_hours + 1
grist.update(node.id, {"Hours": new_hours}, "Nodes")
logger.info(f"Updated node {node_id} hours: {current_hours} -> {new_hours}")
# Remove any existing container with same name
# Remove any existing container with same name
stop_and_remove_container(container_name, logger)
# Start new container
if not start_container(container_name, node_id, logger):
logger.error(f"Failed to start container for node {node_id}")
# Return the hour back since container didn't start
grist.update(node.id, {"Hours": current_hours}, "Nodes")
logger.info(f"Reverted node {node_id} hours back to: {current_hours}")
time.sleep(60)
return
logger.info(f"Container started successfully for node {node_id}")
# Wait 5 hours with progress updates and health checks every 10 minutes
wait_hours = 5
total_minutes = wait_hours * 60
interval_minutes = 10
logger.info(f"Waiting {wait_hours} hours ({total_minutes} minutes) for node {node_id}")
container_failed = False
for elapsed_minutes in range(0, total_minutes, interval_minutes):
remaining_minutes = total_minutes - elapsed_minutes
remaining_hours = remaining_minutes // 60
remaining_mins = remaining_minutes % 60
if elapsed_minutes > 0: # Skip first iteration log
logger.info(f"Node {node_id}: {remaining_hours}h {remaining_mins}m remaining")
# Check container status before sleeping
if not check_container_status(container_name, logger):
logger.error(f"Container {container_name} is not running, attempting restart")
stop_and_remove_container(container_name, logger)
# Start new container
if not start_container(container_name, node_id, logger):
logger.error(f"Failed to start container for node {node_id}")
# Return the hour back since container didn't start
grist.update(node.id, {"Hours": current_hours}, "Nodes")
logger.info(f"Reverted node {node_id} hours back to: {current_hours}")
time.sleep(60)
continue
logger.error(f"Failed to restart container for node {node_id}")
container_failed = True
break
else:
logger.info(f"Container restarted successfully for node {node_id}")
time.sleep(interval_minutes * 60) # Sleep 10 minutes
# If container failed during the cycle, skip to next iteration
if container_failed:
logger.error(f"Container failed during cycle for node {node_id}, moving to next node")
return
logger.info(f"Container started successfully for node {node_id}")
# Wait 5 hours with progress updates and health checks every 10 minutes
wait_hours = 5
total_minutes = wait_hours * 60
interval_minutes = 10
logger.info(f"Waiting {wait_hours} hours ({total_minutes} minutes) for node {node_id}")
container_failed = False
for elapsed_minutes in range(0, total_minutes, interval_minutes):
remaining_minutes = total_minutes - elapsed_minutes
remaining_hours = remaining_minutes // 60
remaining_mins = remaining_minutes % 60
if elapsed_minutes > 0: # Skip first iteration log
logger.info(f"Node {node_id}: {remaining_hours}h {remaining_mins}m remaining")
# Check container status before sleeping
if not check_container_status(container_name, logger):
logger.error(f"Container {container_name} is not running, attempting restart")
stop_and_remove_container(container_name, logger)
if not start_container(container_name, node_id, logger):
logger.error(f"Failed to restart container for node {node_id}")
container_failed = True
break
else:
logger.info(f"Container restarted successfully for node {node_id}")
time.sleep(interval_minutes * 60) # Sleep 10 minutes
# If container failed during the cycle, skip to next iteration
if container_failed:
logger.error(f"Container failed during cycle for node {node_id}, moving to next node")
continue
# Stop and remove container
logger.info(f"5 hours completed for node {node_id}, stopping container")
stop_and_remove_container(container_name, logger)
# Stop and remove container
logger.info(f"5 hours completed for node {node_id}, stopping container")
stop_and_remove_container(container_name, logger)
# Update hours (+4 after completion)
final_hours = new_hours + 4
grist.update(node.id, {"Hours": final_hours}, "Nodes")
logger.info(f"Updated node {node_id} final hours: {new_hours} -> {final_hours}")
# Update hours (+4 after completion)
final_hours = new_hours + 4
grist.update(node.id, {"Hours": final_hours}, "Nodes")
logger.info(f"Updated node {node_id} final hours: {new_hours} -> {final_hours}")
logger.info(f"=== Cycle #{cycle_count} completed for node {node_id} ===")
logger.info(f"=== Cycle #{cycle_count} completed for node {node_id} ===")
except KeyboardInterrupt:
logger.info("Received keyboard interrupt, stopping rotation")
stop_and_remove_container(container_name, logger)
break
except Exception as e:
logger.error(f"Cycle #{cycle_count} failed with error: {str(e)}")
stop_and_remove_container(container_name, logger)
logger.info("Waiting 60 seconds before next attempt")
time.sleep(60)
logger.info("Nexus rotation cycle stopped")
if __name__ == "__main__":