#!/bin/bash
set -euo pipefail

# End-to-end test for the Node Termination Handler (NTH): a scheduled
# maintenance event must cordon and taint the worker node, and a simulated
# system reboot must uncordon and untaint it again.
#
# Available env vars:
#   $TMP_DIR
#   $CLUSTER_NAME
#   $KUBECONFIG
#
# Also read by this script (each must be set, because `set -u` is active):
#   $IMDS_PORT
#   $NODE_TERMINATION_HANDLER_DOCKER_REPO
#   $NODE_TERMINATION_HANDLER_DOCKER_TAG
#   $EC2_METADATA_DOCKER_REPO
#   $EC2_METADATA_DOCKER_TAG

echo "Starting Maintenance Event System Reboot Test for Node Termination Handler"

# Absolute, symlink-resolved directory that contains this script. `&&` makes a
# failed `cd` fail the assignment (and, under `set -e`, the script) instead of
# silently recording the caller's working directory.
SCRIPTPATH="$(cd "$(dirname "$0")" && pwd -P)"

# Install (or upgrade) the Node Termination Handler chart into kube-system,
# pointing it at the local metadata endpoint on $IMDS_PORT with spot and
# scheduled-event draining plus node tainting enabled.
# The release name and chart path are quoted so a checkout path containing
# spaces or glob characters is passed to helm as a single argument.
helm upgrade --install "${CLUSTER_NAME}-anth" "${SCRIPTPATH}/../../config/helm/aws-node-termination-handler/" \
  --wait \
  --force \
  --namespace kube-system \
  --set instanceMetadataURL="http://localhost:${IMDS_PORT}" \
  --set image.repository="${NODE_TERMINATION_HANDLER_DOCKER_REPO}" \
  --set image.tag="${NODE_TERMINATION_HANDLER_DOCKER_TAG}" \
  --set enableSpotInterruptionDraining="true" \
  --set enableScheduledEventDraining="true" \
  --set taintNode="true"

# Install (or upgrade) the ec2-metadata-test-proxy chart into the default
# namespace with scheduled maintenance events enabled and spot ITN disabled,
# listening on $IMDS_PORT.
# The release name and chart path are quoted so a checkout path containing
# spaces or glob characters is passed to helm as a single argument.
helm upgrade --install "${CLUSTER_NAME}-emtp" "${SCRIPTPATH}/../../config/helm/ec2-metadata-test-proxy/" \
  --wait \
  --force \
  --namespace default \
  --set ec2MetadataTestProxy.image.repository="${EC2_METADATA_DOCKER_REPO}" \
  --set ec2MetadataTestProxy.image.tag="${EC2_METADATA_DOCKER_TAG}" \
  --set ec2MetadataTestProxy.enableScheduledMaintenanceEvents="true" \
  --set ec2MetadataTestProxy.enableSpotITN="false" \
  --set ec2MetadataTestProxy.port="${IMDS_PORT}"

# Polling budget used by the cordon/taint wait loops: the value handed to
# `sleep` between checks, and the number of checks to make.
TAINT_CHECK_SLEEP=15
TAINT_CHECK_CYCLES=15

# Result flags. Each starts at 0 and is set to 1 once the corresponding
# verification step has succeeded.
TAINTED=0
CORDONED=0
DEPLOYED=0

# Wait for the regular-pod-test deployment to report zero unavailable
# replicas. Checks up to 10 times, sleeping 5 seconds between checks; sets
# DEPLOYED=1 on success, otherwise the script exits with status 2 below.
for _ in {1..10}; do
    # kubectl runs as part of the `if` condition so that a failed lookup counts
    # as "not ready yet". Inside a bare `[[ $(...) -eq 0 ]]` a failure would
    # produce an empty string, which compares equal to 0 and reports success.
    # An empty value from a *successful* lookup is still treated as 0.
    if unavailable_replicas=$(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') \
        && [[ "${unavailable_replicas:-0}" -eq 0 ]]; then
        echo "✅ Verified regular-pod-test pod was scheduled and started!"
        DEPLOYED=1
        break
    fi
    sleep 5
done

if [[ "$DEPLOYED" -eq 0 ]]; then
    echo "❌ Failed test setup for regular-pod"
    exit 2
fi

# Poll (TAINT_CHECK_CYCLES times, TAINT_CHECK_SLEEP apart) until the worker
# node shows SchedulingDisabled and the regular-pod-test deployment reports one
# unavailable replica. TAINTED is set as soon as the scheduled-maintenance
# taint key is seen in the node JSON; CORDONED is set once the eviction is
# confirmed. Either flag still being 0 afterwards fails the test with status 3.
for ((attempt = 1; attempt <= TAINT_CHECK_CYCLES; attempt++)); do
    # grep is intentionally not silenced: the matching line is kept in the log.
    if kubectl get nodes "${CLUSTER_NAME}-worker" | grep SchedulingDisabled; then
        echo "✅ Verified the worker node was cordoned for maintenance event reboot!"

        if kubectl get nodes "${CLUSTER_NAME}-worker" -o json | grep -F "aws-node-termination-handler/scheduled-maintenance"; then
            echo "✅ Verified the worker node was tainted!"
            TAINTED=1
        fi

        # A failed kubectl call simply means "not evicted yet" and is retried.
        if unavailable_replicas=$(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') \
            && [[ "${unavailable_replicas:-0}" -eq 1 ]]; then
            echo "✅ Verified the regular-pod-test pod was evicted!"
            CORDONED=1
            break
        fi
    fi
    sleep "$TAINT_CHECK_SLEEP"
done

if [[ "$CORDONED" -eq 0 ]]; then
    echo "❌ Failed cordoning node for scheduled maintenance event"
    exit 3
fi

if [[ "$TAINTED" -eq 0 ]]; then
    echo "❌ Failed tainting node for scheduled maintenance event"
    exit 3
fi

## Copy uptime file to Kind k8s nodes
# The node list is resolved in a standalone assignment so that a kubectl or jq
# failure aborts the script (`set -e` + `pipefail`). In a `for ... in $(...)`
# list the same failure is ignored and the loop silently runs zero times.
node_names=$(kubectl get nodes -o json | jq -r '.items[].metadata.name')

# Read the names on fd 3 so the docker commands keep the script's own stdin.
while IFS= read -r node <&3; do
    # A here-string always yields at least one (possibly empty) line.
    [[ -n "$node" ]] || continue
    docker exec "$node" sh -c "rm -rf /uptime"
    docker cp "${SCRIPTPATH}/../assets/uptime-reboot" "${node}:/uptime"
    # Make the file read-only and owned by root:root.
    docker exec "$node" sh -c "chmod 0444 /uptime && chown root /uptime && chgrp root /uptime"
done 3<<<"$node_names"

## Remove ec2-metadata-test-proxy to prevent another drain event but keep regular-pod-test
# `-o name` prints one `<resource>/<name>` entry per line, so the
# human-readable table no longer has to be cut apart by column. Under
# `set -e` + `pipefail` this assignment aborts the script when kubectl fails
# or when no daemonset matches.
daemonset=$(kubectl get daemonsets -o name | grep -F 'ec2-metadata-test-proxy')

# Delete each match on its own so every name is passed as one quoted argument.
while IFS= read -r proxy_daemonset <&3; do
    [[ -n "$proxy_daemonset" ]] || continue
    kubectl delete "$proxy_daemonset"
done 3<<<"$daemonset"

## Restart NTH which will simulate a system reboot by mounting a new uptime file
# Same chart and settings as the initial install, plus procUptimeFile="/uptime".
# The release name and chart path are quoted so a checkout path containing
# spaces or glob characters is passed to helm as a single argument.
helm upgrade --install "${CLUSTER_NAME}-anth" "${SCRIPTPATH}/../../config/helm/aws-node-termination-handler/" \
  --wait \
  --force \
  --namespace kube-system \
  --set instanceMetadataURL="http://localhost:${IMDS_PORT}" \
  --set image.repository="${NODE_TERMINATION_HANDLER_DOCKER_REPO}" \
  --set image.tag="${NODE_TERMINATION_HANDLER_DOCKER_TAG}" \
  --set procUptimeFile="/uptime" \
  --set enableSpotInterruptionDraining="true" \
  --set enableScheduledEventDraining="true" \
  --set taintNode="true"

# After the simulated reboot, poll (TAINT_CHECK_CYCLES times, TAINT_CHECK_SLEEP
# apart) until the worker node is Ready and schedulable again, its
# scheduled-maintenance taint is gone and regular-pod-test has no unavailable
# replicas. Exits 0 on success, 3 if the taint is still present once the node
# is uncordoned, and 1 if the budget runs out.
for ((attempt = 1; attempt <= TAINT_CHECK_CYCLES; attempt++)); do
    # --no-headers leaves only the node's own row; STATUS is its second column.
    node_line=$(kubectl get nodes "${CLUSTER_NAME}-worker" --no-headers)
    read -r _ node_status _ <<<"$node_line"

    # Exact match: a substring search for "Ready" would also accept "NotReady",
    # and a cordoned node reports "Ready,SchedulingDisabled".
    if [[ "$node_status" == "Ready" ]]; then
        echo "✅ Verified the worker node was uncordoned!"

        # Fetch the JSON first so a kubectl failure aborts the script (set -e).
        # With `! kubectl ... | grep -q` under pipefail, any kubectl failure
        # made the negated pipeline succeed and reported the taint as cleared.
        node_json=$(kubectl get nodes "${CLUSTER_NAME}-worker" -o json)
        if ! grep -qF "aws-node-termination-handler/scheduled-maintenance" <<<"$node_json"; then
            echo "✅ Verified the worker node was untainted!"
            TAINTED=0
        else
            echo "❌ Failed clearing the worker node taint after a reboot!"
            exit 3
        fi

        # A failed lookup must not count as "0 unavailable replicas"; retry it.
        if unavailable_replicas=$(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') \
            && [[ "${unavailable_replicas:-0}" -eq 0 ]]; then
            echo "✅ Verified the regular-pod-test pod was rescheduled"
            echo "✅ Scheduled Maintenance Event System Reboot Test Passed $CLUSTER_NAME! ✅"
            exit 0
        fi
    fi
    sleep "$TAINT_CHECK_SLEEP"
done

echo "❌ Timed out waiting for the worker node to be uncordoned and regular-pod-test to be rescheduled"
exit 1
