#!/bin/bash
set -euo pipefail

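# Available env vars:
#   $TMP_DIR
#   $CLUSTER_NAME
#   $KUBECONFIG
#   $AEMM_URL
#   $AEMM_VERSION
#
# Also read by this script:
#   required: $IMDS_PORT, $AEMM_DL_URL,
#             $NODE_TERMINATION_HANDLER_DOCKER_REPO, $NODE_TERMINATION_HANDLER_DOCKER_TAG,
#             $WEBHOOK_DOCKER_REPO, $WEBHOOK_DOCKER_TAG
#   optional: $INSTANCE_METADATA_URL (defaults to http://$AEMM_URL:$IMDS_PORT),
#             $TEST_NODE (defaults to $CLUSTER_NAME-worker), $TEST_WINDOWS,
#             $NTH_WORKER_LABEL, $NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY,
#             $WEBHOOK_DOCKER_PULL_POLICY
#
# Commands used below but not defined in this file (must be provided by the
# caller's environment): retry, get_nth_worker_pod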

# Print the failure banner for this test (including the cluster name) on
# stdout and terminate the script.
# Globals:   CLUSTER_NAME (read)
# Arguments: $1 - exit status to terminate with (optional, defaults to 1)
fail_and_exit() {
    local status=${1:-1}
    printf '%s\n' "❌ Scheduled Maintenance Event System Reboot Test failed $CLUSTER_NAME ❌"
    exit "$status"
}

# Announce the test. The name matches the "System Reboot" pass/fail banners
# this script prints (the previous banner said "Cancellation").
echo "Starting Maintenance Event System Reboot Test for Node Termination Handler"

# Physical (symlink-resolved) directory containing this script; chart and
# asset paths below are built relative to it. `&&` makes a failed `cd` abort
# the script instead of silently yielding the current directory.
SCRIPTPATH="$(cd "$(dirname "$0")" && pwd -P)"

# Helm flags appended to every chart installed by this test.
common_helm_args=()
if [[ "${TEST_WINDOWS-}" == "true" ]]; then
    common_helm_args+=(--set targetNodeOs="windows")
fi
if [[ -n "${NTH_WORKER_LABEL-}" ]]; then
    common_helm_args+=(--set nodeSelector."$NTH_WORKER_LABEL")
fi

# Install (or upgrade) the Node Termination Handler release in kube-system
# with spot-interruption draining, scheduled-event draining and node tainting
# enabled. The metadata URL falls back to http://$AEMM_URL:$IMDS_PORT when
# INSTANCE_METADATA_URL is unset or empty.
anth_helm_args=(upgrade --install "$CLUSTER_NAME-anth")
anth_helm_args+=("$SCRIPTPATH/../../config/helm/aws-node-termination-handler/")
anth_helm_args+=(--wait --force --namespace kube-system)
anth_helm_args+=(
  --set "instanceMetadataURL=${INSTANCE_METADATA_URL:-http://$AEMM_URL:$IMDS_PORT}"
  --set "image.repository=$NODE_TERMINATION_HANDLER_DOCKER_REPO"
  --set "image.tag=$NODE_TERMINATION_HANDLER_DOCKER_TAG"
  --set enableSpotInterruptionDraining=true
  --set enableScheduledEventDraining=true
  --set taintNode=true
)
# Override the image pull policy only when the caller supplied one.
if [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]]; then
  anth_helm_args+=(--set "image.pullPolicy=$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY")
fi
# Append the shared flags only when there are any (guard kept from the
# original; presumably avoids expanding an empty array under `set -u`).
if (( ${#common_helm_args[@]} > 0 )); then
  anth_helm_args+=("${common_helm_args[@]}")
fi

# Trace the exact helm invocation into the test log.
set -x
helm "${anth_helm_args[@]}"
set +x

# Install (or upgrade) the webhook-test-proxy chart into the default
# namespace, using the webhook image named by the caller's environment.
emtp_helm_args=(upgrade --install "$CLUSTER_NAME-emtp")
emtp_helm_args+=("$SCRIPTPATH/../../config/helm/webhook-test-proxy/")
emtp_helm_args+=(--wait --namespace default)
emtp_helm_args+=(
  --set "webhookTestProxy.image.repository=$WEBHOOK_DOCKER_REPO"
  --set "webhookTestProxy.image.tag=$WEBHOOK_DOCKER_TAG"
)
# Override the image pull policy only when the caller supplied one.
if [[ -n "${WEBHOOK_DOCKER_PULL_POLICY-}" ]]; then
  emtp_helm_args+=(--set "webhookTestProxy.image.pullPolicy=$WEBHOOK_DOCKER_PULL_POLICY")
fi
# Append the shared flags only when there are any.
if (( ${#common_helm_args[@]} > 0 )); then
  emtp_helm_args+=("${common_helm_args[@]}")
fi

# Trace the exact helm invocation into the test log.
set -x
helm "${emtp_helm_args[@]}"
set +x

# Install (or upgrade) the chart located at $AEMM_DL_URL as the
# "$CLUSTER_NAME-aemm" release in the default namespace, serving on
# $IMDS_PORT and started with the "events" argument.
aemm_helm_args=(upgrade --install "$CLUSTER_NAME-aemm" "$AEMM_DL_URL")
aemm_helm_args+=(--wait --namespace default)
aemm_helm_args+=(
  --set "servicePort=$IMDS_PORT"
  --set 'arguments={events}'
)
# Append the shared flags only when there are any.
if (( ${#common_helm_args[@]} > 0 )); then
  aemm_helm_args+=("${common_helm_args[@]}")
fi

# `retry` is not defined in this file; it is invoked here as `retry 5 <cmd>`.
set -x
retry 5 helm "${aemm_helm_args[@]}"
set +x

# Polling budget used by every wait loop in this test: number of attempts and
# seconds slept between attempts.
TAINT_CHECK_CYCLES=15
TAINT_CHECK_SLEEP=15

# Wait until the regular-pod-test deployment reports no unavailable replicas.
deployed=0
for ((i = 1; i <= TAINT_CHECK_CYCLES; i++)); do
    # jsonpath prints an empty string when .status.unavailableReplicas is
    # absent; that is treated as 0. A failing kubectl call, however, is no
    # longer mistaken for "0 unavailable" - it just triggers another attempt.
    if unavailable=$(kubectl get deployments regular-pod-test -o jsonpath='{.status.unavailableReplicas}') &&
        [[ ${unavailable:-0} -eq 0 ]]; then
        echo "✅ Verified regular-pod-test pod was scheduled and started!"
        deployed=1
        break
    fi
    # Progress is reported as attempt/total attempts (not the sleep interval).
    echo "Setup Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
    sleep "$TAINT_CHECK_SLEEP"
done

if [[ $deployed -eq 0 ]]; then
    echo "❌ Failed test setup for regular-pod"
    fail_and_exit 2
fi

# Wait for NTH to react to the scheduled maintenance event on the test node:
# first the cordon, then the taint, then the eviction of regular-pod-test.
cordoned=0
tainted=0
test_node="${TEST_NODE:-$CLUSTER_NAME-worker}"
for ((i = 1; i <= TAINT_CHECK_CYCLES; i++)); do
    # grep output goes to /dev/null (rather than `grep -q`) so grep consumes
    # all of kubectl's output and the pipeline status under pipefail reflects
    # only real failures.
    if [[ $cordoned -eq 0 ]] && kubectl get nodes "$test_node" | grep SchedulingDisabled >/dev/null; then
        echo "✅ Verified the worker node was cordoned for maintenance event reboot!"
        cordoned=1
    fi

    # Check the taint only until it has been seen once, so the success line is
    # printed a single time and no redundant kubectl calls are made.
    if [[ $cordoned -eq 1 && $tainted -eq 0 ]] &&
        kubectl get nodes "$test_node" -o json | grep "aws-node-termination-handler/scheduled-maintenance" >/dev/null; then
      echo "✅ Verified the worker node was tainted!"
      tainted=1
    fi

    if [[ $tainted -eq 1 && $(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') -eq 1 ]]; then
        echo "✅ Verified the regular-pod-test pod was evicted!"
        break
    fi
    echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
    sleep "$TAINT_CHECK_SLEEP"
done

# Only the cordon and the taint are hard requirements here; the eviction
# check above merely ends the loop early when it is observed.
if [[ $cordoned -eq 0 ]]; then
    echo "❌ Failed cordoning node for scheduled maintenance event"
    fail_and_exit 3
fi

if [[ $tainted -eq 0 ]]; then
    echo "❌ Failed tainting node for scheduled maintenance event"
    fail_and_exit 3
fi

# Place a mock uptime file at $mock_uptime_filepath so the restarted NTH can
# be pointed at it (via procUptimeFile) to simulate a reboot.
mock_uptime_filepath="/uptime"
if [[ "${TEST_WINDOWS:-"false"}" != "true" ]]; then
    echo "Copy uptime file to Kind k8s nodes"
    # Resolve the node list up front so a kubectl/jq failure aborts the test
    # (set -e / pipefail) instead of silently producing an empty loop.
    node_names=$(kubectl get nodes -o json | jq -r '.items[].metadata.name')
    # Node names are used as docker container names here. Word splitting of
    # the list is intentional.
    # shellcheck disable=SC2086
    for node in $node_names; do
        docker exec "$node" sh -c "rm -rf $mock_uptime_filepath"
        docker cp "$SCRIPTPATH/../assets/uptime-reboot" "$node:$mock_uptime_filepath"
        docker exec "$node" sh -c "chmod 0444 $mock_uptime_filepath && chown root $mock_uptime_filepath && chgrp root $mock_uptime_filepath"
    done
else
    # Use the defaulted node name; $TEST_NODE itself may be unset under `set -u`.
    echo "Copy uptime file to ${TEST_NODE:-$CLUSTER_NAME-worker}"
    # Source and destination must be two separate arguments to `kubectl cp`.
    # NOTE(review): this branch copies "uptime-root" while the branch above
    # copies "uptime-reboot" - confirm which asset is intended.
    kubectl cp "$SCRIPTPATH/../assets/uptime-root" "kube-system/$(get_nth_worker_pod):$mock_uptime_filepath"
fi

# Delete the metadata-mock deployment so that no further drain event is
# served once the node has been handled.
echo "Remove amazon-ec2-metadata-mock to prevent another drain event"
# First column of the matching row in the `kubectl get deployments` table.
aemm_deployment=$(
    kubectl get deployments \
        | grep 'amazon-ec2-metadata-mock' \
        | cut -d ' ' -f 1
)
kubectl delete deployments "$aemm_deployment"

## Restart NTH which will simulate a system reboot by mounting a new uptime file
# Same release and settings as the initial install, plus procUptimeFile
# pointing at the mock uptime file.
anth_helm_args=(
  upgrade
  --install
  "$CLUSTER_NAME-anth"
  "$SCRIPTPATH/../../config/helm/aws-node-termination-handler/"
  --wait
  --force
  --namespace kube-system
  --set instanceMetadataURL="${INSTANCE_METADATA_URL:-"http://$AEMM_URL:$IMDS_PORT"}"
  --set image.repository="$NODE_TERMINATION_HANDLER_DOCKER_REPO"
  --set image.tag="$NODE_TERMINATION_HANDLER_DOCKER_TAG"
  --set procUptimeFile="$mock_uptime_filepath"
  --set enableSpotInterruptionDraining="true"
  --set enableScheduledEventDraining="true"
  --set taintNode="true"
)
# Re-apply the pull-policy override used by the initial install; this upgrade
# does not pass --reuse-values, so an override that is not repeated here would
# not carry over to the restarted release.
if [[ -n "${NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY-}" ]]; then
    anth_helm_args+=(--set image.pullPolicy="$NODE_TERMINATION_HANDLER_DOCKER_PULL_POLICY")
fi
# Append the shared flags only when there are any.
if [[ ${#common_helm_args[@]} -gt 0 ]]; then
    anth_helm_args+=("${common_helm_args[@]}")
fi

# Trace the exact helm invocation into the test log.
set -x
helm "${anth_helm_args[@]}"
set +x

# After the restart, wait for the node to be uncordoned, then untainted, then
# for regular-pod-test to have no unavailable replicas. Success exits 0 from
# inside the loop; running out of attempts falls through to the failure report.
uncordoned=0
untainted=0
for ((i = 1; i <= TAINT_CHECK_CYCLES; i++)); do
    # Row for the test node from `kubectl get nodes`, with the header dropped.
    node_line=$(kubectl get nodes "$test_node" | grep -v 'STATUS')
    # "NotReady" contains "Ready", so it is excluded explicitly; a cordoned
    # node shows "SchedulingDisabled".
    if [[ $uncordoned -eq 0 && $node_line != *SchedulingDisabled* &&
        $node_line != *NotReady* && $node_line == *Ready* ]]; then
        echo "✅ Verified the worker node was uncordoned!"
        uncordoned=1
    fi

    # Capture the node JSON first: a failing kubectl call must not be read as
    # "taint is gone" (which `! kubectl ... | grep -q` does under pipefail).
    if [[ $uncordoned -eq 1 && $untainted -eq 0 ]] &&
        node_json=$(kubectl get nodes "$test_node" -o json) &&
        [[ $node_json != *"aws-node-termination-handler/scheduled-maintenance"* ]]; then
      echo "✅ Verified the worker node was untainted!"
      untainted=1
    fi

    # An empty jsonpath result (field absent) counts as 0 unavailable replicas;
    # a kubectl failure does not.
    if [[ $untainted -eq 1 ]] &&
        unavailable=$(kubectl get deployments regular-pod-test -o=jsonpath='{.status.unavailableReplicas}') &&
        [[ ${unavailable:-0} -eq 0 ]]; then
        echo "✅ Verified the regular-pod-test pod was rescheduled"
        echo "✅ Scheduled Maintenance Event System Reboot Test Passed $CLUSTER_NAME! ✅"
        exit 0
    fi
    echo "Assertion Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
    sleep "$TAINT_CHECK_SLEEP"
done

# Report the first stage that was never reached, then fail the test.
if [[ $uncordoned -eq 0 ]]; then
    echo "❌ Worker node was not UNcordoned"
elif [[ $untainted -eq 0 ]]; then
    echo "❌ Worker node was not UNtainted"
else
    echo "❌ regular-pod-test pod was not rescheduled"
fi
fail_and_exit 1
