package burlap.behavior.singleagent.learning.actorcritic.critics;

import burlap.behavior.singleagent.learning.actorcritic.critics.TDLambda;
import burlap.behavior.singleagent.options.EnvironmentOptionOutcome;
import burlap.behavior.singleagent.options.Option;
import burlap.behavior.valuefunction.ValueFunction;
import burlap.mdp.core.TerminalFunction;
import burlap.mdp.core.state.State;
import burlap.mdp.singleagent.environment.EnvironmentOutcome;
import burlap.mdp.singleagent.model.RewardFunction;
import burlap.statehashing.HashableState;
import burlap.statehashing.HashableStateFactory;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

/**
 * A {@link TDLambda} critic whose value estimates are keyed by the time index within an episode
 * in addition to the (hashed) state. The same state observed at two different time indices is
 * therefore backed by two independent {@link TDLambda.VValue} entries.
 *
 * <p>The time index starts at 0 in {@link #startEpisode(State)} and is advanced by every call to
 * {@link #critique(EnvironmentOutcome)}.
 */
public class TimeIndexedTDLambda extends TDLambda {

    /**
     * Time-indexed value tables: element {@code t} maps a hashed state to its value entry for
     * time index {@code t}. Tables are created lazily by {@link #getV(HashableState, int)}.
     */
    protected List<Map<HashableState, TDLambda.VValue>> vTIndex;

    /** The time index used for the source state of the next {@link #critique} call. */
    protected int curTime;

    /**
     * Horizon used by {@link #critique}: when {@code curTime >= maxEpisodeSize - 1} the value of
     * the next state is treated as 0. Defaults to {@link Integer#MAX_VALUE}.
     */
    protected int maxEpisodeSize = Integer.MAX_VALUE;

    /**
     * An eligibility trace that additionally records the time index at which its state was
     * visited.
     */
    public static class StateTimeElibilityTrace extends TDLambda.StateEligibilityTrace {

        /** The time index at which the traced state was visited. */
        public int timeIndex;

        /**
         * Creates a trace for a state visited at a given time index.
         *
         * @param hashableState the hashed state being traced
         * @param i the time index at which the state was visited
         * @param d the initial eligibility of the trace
         * @param vValue the value entry that the trace updates
         */
        public StateTimeElibilityTrace(HashableState hashableState, int i, double d, TDLambda.VValue vValue) {
            super(hashableState, d, vValue);
            this.timeIndex = i;
        }
    }

    /**
     * Creates a critic with the default horizon of {@link Integer#MAX_VALUE}. All arguments are
     * forwarded unchanged, in the same order, to the five-argument {@link TDLambda} constructor;
     * see that constructor for the meaning of each numeric argument.
     *
     * @param d first argument of the {@link TDLambda} constructor
     * @param hashableStateFactory the state hashing factory
     * @param d2 third argument of the {@link TDLambda} constructor
     * @param d3 fourth argument of the {@link TDLambda} constructor
     * @param d4 fifth argument of the {@link TDLambda} constructor
     */
    public TimeIndexedTDLambda(double d, HashableStateFactory hashableStateFactory, double d2, double d3, double d4) {
        super(d, hashableStateFactory, d2, d3, d4);
        this.vTIndex = new ArrayList<Map<HashableState, TDLambda.VValue>>();
    }

    /**
     * Creates a critic with an explicit horizon. The numeric arguments and the hashing factory
     * are forwarded unchanged, in the same order, to the five-argument {@link TDLambda}
     * constructor.
     *
     * @param rewardFunction not used by this constructor; retained for source compatibility
     * @param terminalFunction not used by this constructor; retained for source compatibility
     * @param d first argument of the {@link TDLambda} constructor
     * @param hashableStateFactory the state hashing factory
     * @param d2 third argument of the {@link TDLambda} constructor
     * @param d3 fourth argument of the {@link TDLambda} constructor
     * @param d4 fifth argument of the {@link TDLambda} constructor
     * @param i the value assigned to {@link #maxEpisodeSize}
     */
    public TimeIndexedTDLambda(RewardFunction rewardFunction, TerminalFunction terminalFunction, double d, HashableStateFactory hashableStateFactory, double d2, double d3, double d4, int i) {
        super(d, hashableStateFactory, d2, d3, d4);
        this.maxEpisodeSize = i;
        this.vTIndex = new ArrayList<Map<HashableState, TDLambda.VValue>>();
    }

    /**
     * Creates a critic with an explicit horizon and a {@link ValueFunction} argument. All
     * arguments except {@code i} are forwarded unchanged, in the same order, to the matching
     * {@link TDLambda} constructor.
     *
     * @param d first argument of the {@link TDLambda} constructor
     * @param hashableStateFactory the state hashing factory
     * @param d2 third argument of the {@link TDLambda} constructor
     * @param valueFunction fourth argument of the {@link TDLambda} constructor
     * @param d3 fifth argument of the {@link TDLambda} constructor
     * @param i the value assigned to {@link #maxEpisodeSize}
     */
    public TimeIndexedTDLambda(double d, HashableStateFactory hashableStateFactory, double d2, ValueFunction valueFunction, double d3, int i) {
        super(d, hashableStateFactory, d2, valueFunction, d3);
        this.maxEpisodeSize = i;
        this.vTIndex = new ArrayList<Map<HashableState, TDLambda.VValue>>();
    }

    /**
     * Returns the current time index.
     *
     * @return the time index that the next {@link #critique} call will use for its source state
     */
    public int getCurTime() {
        return this.curTime;
    }

    /**
     * Overrides the current time index.
     *
     * @param i the time index that the next {@link #critique} call will use for its source state
     */
    public void setCurTime(int i) {
        this.curTime = i;
    }

    /**
     * Performs the superclass episode start handling and then resets the time index to 0.
     *
     * @param state the initial state of the episode, passed through to the superclass
     */
    @Override
    public void startEpisode(State state) {
        super.startEpisode(state);
        this.curTime = 0;
    }

    /** Delegates to the superclass; no time-indexed state is changed here. */
    @Override
    public void endEpisode() {
        super.endEpisode();
    }

    /**
     * Computes the TD error for an observed transition and applies it to the time-indexed values.
     *
     * <p>In order, this method: computes the TD error from the reward, the discounted value of
     * the next state and the value of the current state at the current time index; updates every
     * stored trace with that error and decays its eligibility; updates the value of the current
     * (state, time index) pair; appends a new {@link StateTimeElibilityTrace} for that pair; and
     * advances the time index and the total step counter.
     *
     * @param environmentOutcome the observed transition
     * @return the TD error of the transition
     */
    @Override
    public double critique(EnvironmentOutcome environmentOutcome) {
        HashableState curState = this.hashingFactory.hashState(environmentOutcome.o);
        HashableState nextState = this.hashingFactory.hashState(environmentOutcome.op);
        double reward = environmentOutcome.r;

        // A primitive action spans one step; an option spans numSteps() steps and its discount
        // is gamma raised to that number of steps.
        double discount = this.gamma;
        int numSteps = 1;
        if (environmentOutcome.a instanceof Option) {
            numSteps = ((EnvironmentOptionOutcome) environmentOutcome).numSteps();
            discount = Math.pow(this.gamma, numSteps);
        }

        TDLambda.VValue curValue = getV(curState, this.curTime);

        // The next state's value stays 0 on termination or once the horizon is reached.
        // NOTE(review): the horizon test looks only at curTime while the lookup uses
        // curTime + numSteps, so a multi-step option may bootstrap from a time index at or
        // beyond maxEpisodeSize. Confirm whether that is intended.
        double nextValue = 0.0d;
        if (!environmentOutcome.terminated && this.curTime < this.maxEpisodeSize - 1) {
            nextValue = getV(nextState, this.curTime + numSteps).v;
        }

        double tdError = (reward + (discount * nextValue)) - curValue.v;

        // Propagate the error to every previously traced (state, time) pair, then decay it.
        for (TDLambda.StateEligibilityTrace trace : this.traces) {
            trace.v.v += this.learningRate.pollLearningRate(this.totalNumberOfSteps, trace.sh.s(), null) * tdError * trace.eligibility;
            trace.eligibility = trace.eligibility * this.lambda * discount;
        }

        // The current pair is updated directly and always gets a fresh trace, which starts out
        // already decayed by one step.
        curValue.v += this.learningRate.pollLearningRate(this.totalNumberOfSteps, curState.s(), null) * tdError;
        this.traces.add(new StateTimeElibilityTrace(curState, this.curTime, discount * this.lambda, curValue));

        this.curTime += numSteps;
        this.totalNumberOfSteps++;
        return tdError;
    }

    /**
     * Returns the value entry for a state at a time index, creating it if it does not exist yet.
     * Missing value tables up to and including index {@code i} are created as empty maps, and a
     * missing entry is initialized from {@code vInitFunction}.
     *
     * @param hashableState the hashed state to look up
     * @param i the time index to look up
     * @return the stored (possibly newly created) value entry; never {@code null}
     */
    protected TDLambda.VValue getV(HashableState hashableState, int i) {
        // Grow the table list so that index i is addressable.
        while (this.vTIndex.size() <= i) {
            this.vTIndex.add(new HashMap<HashableState, TDLambda.VValue>());
        }
        Map<HashableState, TDLambda.VValue> valuesAtTime = this.vTIndex.get(i);
        TDLambda.VValue value = valuesAtTime.get(hashableState);
        if (value == null) {
            value = new TDLambda.VValue(this.vInitFunction.value(hashableState.s()));
            valuesAtTime.put(hashableState, value);
        }
        return value;
    }

    /** Performs the superclass reset and then discards all time-indexed value tables. */
    @Override
    public void reset() {
        super.reset();
        this.vTIndex.clear();
    }
}
