package burlap.behavior.singleagent.learning.tdmethods;

import burlap.behavior.policy.Policy;
import burlap.behavior.singleagent.Episode;
import burlap.behavior.singleagent.options.EnvironmentOptionOutcome;
import burlap.behavior.singleagent.options.Option;
import burlap.behavior.valuefunction.QFunction;
import burlap.behavior.valuefunction.QValue;
import burlap.mdp.core.action.Action;
import burlap.mdp.core.state.State;
import burlap.mdp.singleagent.SADomain;
import burlap.mdp.singleagent.environment.Environment;
import burlap.mdp.singleagent.environment.EnvironmentOutcome;
import burlap.statehashing.HashableState;
import burlap.statehashing.HashableStateFactory;
import java.util.Iterator;
import java.util.LinkedList;

/* loaded from: input_file:burlap/behavior/singleagent/learning/tdmethods/SarsaLam.class */
/**
 * SARSA(lambda)-style learning agent layered on {@link QLearning}.
 *
 * <p>Each learning episode keeps a list of {@link EligibilityTrace} entries, one per
 * state-action pair visited so far in that episode. After every environment step the
 * temporal-difference error for the pair just executed is applied to every traced Q-value,
 * scaled by that trace's eligibility, and every eligibility is then multiplied by
 * {@code lambda * discount}. When a state is revisited, the trace for the action taken is
 * reset to eligibility 1 and traces for other actions in that same state are reset to 0.
 */
public class SarsaLam extends QLearning {
    /** Trace decay factor; every eligibility is multiplied by {@code lambda * discount} after each step. */
    protected double lambda;

    /**
     * Bookkeeping record that ties a Q-value entry to its current eligibility within one episode.
     */
    public static class EligibilityTrace {
        /** Current eligibility weight applied to the TD error when updating {@link #q}. */
        public double eligibility;
        /** Hashed state this trace belongs to. */
        public HashableState sh;
        /** The Q-value entry (state-action pair) that this trace updates in place. */
        public QValue q;
        /** Snapshot of {@code q.q} taken when this trace was constructed. */
        public double initialQ;

        /**
         * Creates a trace and snapshots the current value of {@code qValue.q} into {@link #initialQ}.
         *
         * @param hashableState the hashed state of the traced state-action pair
         * @param qValue the Q-value entry to be updated through this trace
         * @param d the starting eligibility
         */
        public EligibilityTrace(HashableState hashableState, QValue qValue, double d) {
            this.sh = hashableState;
            this.q = qValue;
            this.eligibility = d;
            this.initialQ = qValue.q;
        }
    }

    /**
     * Forwards the first five arguments unchanged to the matching {@link QLearning} constructor
     * and stores the last one as {@link #lambda}.
     *
     * @param d4 the trace decay factor lambda
     */
    public SarsaLam(SADomain sADomain, double d, HashableStateFactory hashableStateFactory, double d2, double d3, double d4) {
        super(sADomain, d, hashableStateFactory, d2, d3);
        sarsalamInit(d4);
    }

    /**
     * Forwards the first six arguments unchanged to the matching {@link QLearning} constructor
     * and stores the last one as {@link #lambda}.
     *
     * @param d4 the trace decay factor lambda
     */
    public SarsaLam(SADomain sADomain, double d, HashableStateFactory hashableStateFactory, double d2, double d3, int i, double d4) {
        super(sADomain, d, hashableStateFactory, d2, d3, i);
        sarsalamInit(d4);
    }

    /**
     * Forwards the first seven arguments unchanged to the matching {@link QLearning} constructor
     * and stores the last one as {@link #lambda}.
     *
     * @param d4 the trace decay factor lambda
     */
    public SarsaLam(SADomain sADomain, double d, HashableStateFactory hashableStateFactory, double d2, double d3, Policy policy, int i, double d4) {
        super(sADomain, d, hashableStateFactory, d2, d3, policy, i);
        sarsalamInit(d4);
    }

    /**
     * Forwards the first seven arguments unchanged to the matching {@link QLearning} constructor
     * and stores the last one as {@link #lambda}.
     *
     * @param d3 the trace decay factor lambda
     */
    public SarsaLam(SADomain sADomain, double d, HashableStateFactory hashableStateFactory, QFunction qFunction, double d2, Policy policy, int i, double d3) {
        super(sADomain, d, hashableStateFactory, qFunction, d2, policy, i);
        sarsalamInit(d3);
    }

    /**
     * Stores the trace decay factor.
     *
     * @param d the value assigned to {@link #lambda}
     */
    protected void sarsalamInit(double d) {
        this.lambda = d;
    }

    /**
     * Runs one learning episode in the given environment, updating Q-values through
     * eligibility traces after every step.
     *
     * <p>{@code maxQChangeInLastEpisode} and {@code eStepCounter} are reset at the start of the
     * episode. The episode stops when the environment reports a terminal state or when
     * {@code eStepCounter} reaches {@code i}.
     *
     * @param environment the environment to act in, starting from its current observation
     * @param i the maximum number of steps for this episode; {@code -1} disables the limit
     * @return the recorded episode
     */
    @Override // burlap.behavior.singleagent.learning.tdmethods.QLearning, burlap.behavior.singleagent.learning.LearningAgent
    public Episode runLearningEpisode(Environment environment, int i) {
        State initialState = environment.currentObservation();
        Episode episode = new Episode(initialState);
        this.maxQChangeInLastEpisode = 0.0d;
        HashableState curState = stateHash(initialState);
        this.eStepCounter = 0;
        LinkedList<EligibilityTrace> traces = new LinkedList<EligibilityTrace>();

        Action action = this.learningPolicy.action(curState.s());
        QValue curQ = getQ(curState, action);

        while (!environment.isInTerminalState() && (this.eStepCounter < i || i == -1)) {
            // Options drive the environment themselves; primitive actions are executed directly.
            EnvironmentOutcome outcome;
            if (action instanceof Option) {
                outcome = ((Option) action).control(environment, this.gamma);
            } else {
                outcome = environment.executeAction(action);
            }

            // The next action is chosen now because the TD target uses the Q-value of
            // the state-action pair that will actually be executed next.
            HashableState nextState = stateHash(outcome.op);
            Action nextAction = this.learningPolicy.action(nextState.s());
            QValue nextQ = getQ(nextState, nextAction);
            // Nothing is bootstrapped from a terminal state.
            double nextQValue = environment.isInTerminalState() ? 0.0d : nextQ.q;

            // Option outcomes carry their own discount and step count.
            double reward = outcome.r;
            double discount = outcome instanceof EnvironmentOptionOutcome ? ((EnvironmentOptionOutcome) outcome).discount : this.gamma;
            this.eStepCounter += outcome instanceof EnvironmentOptionOutcome ? ((EnvironmentOptionOutcome) outcome).numSteps() : 1;

            if ((action instanceof Option) && this.shouldDecomposeOptions) {
                episode.appendAndMergeEpisodeAnalysis(((EnvironmentOptionOutcome) outcome).episode);
            } else {
                episode.transition(action, nextState.s(), reward);
            }

            // TD error for the state-action pair that was just executed.
            double delta = (reward + (discount * nextQValue)) - curQ.q;

            boolean foundCurrentTrace = false;
            for (EligibilityTrace trace : traces) {
                if (trace.sh.equals(curState)) {
                    if (trace.q.a.equals(action)) {
                        // Revisited pair: reset its eligibility to full strength.
                        foundCurrentTrace = true;
                        trace.eligibility = 1.0d;
                    } else {
                        // Same state, different action: clear its eligibility.
                        trace.eligibility = 0.0d;
                    }
                }
                double rate = this.learningRate.pollLearningRate(this.totalNumberOfSteps, trace.sh.s(), trace.q.a);
                trace.q.q += rate * trace.eligibility * delta;
                trace.eligibility = trace.eligibility * this.lambda * discount;
                recordQChange(trace);
            }

            if (!foundCurrentTrace) {
                double rate = this.learningRate.pollLearningRate(this.totalNumberOfSteps, curQ.s, curQ.a);
                // Build the trace BEFORE applying the update so that initialQ holds the
                // pre-update value; otherwise the measured change would always be zero
                // and this first update would never count toward maxQChangeInLastEpisode.
                EligibilityTrace newTrace = new EligibilityTrace(curState, curQ, this.lambda * discount);
                curQ.q += rate * delta;
                traces.add(newTrace);
                recordQChange(newTrace);
            }

            curState = nextState;
            action = nextAction;
            curQ = nextQ;
            this.totalNumberOfSteps++;
        }
        return episode;
    }

    /** Raises {@code maxQChangeInLastEpisode} if the trace's Q-value has moved further from its snapshot. */
    private void recordQChange(EligibilityTrace trace) {
        double change = Math.abs(trace.initialQ - trace.q.q);
        if (change > this.maxQChangeInLastEpisode) {
            this.maxQChangeInLastEpisode = change;
        }
    }
}
