aima.core.learning.reinforcement.agent.PassiveADPAgent Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of aima-core Show documentation
AIMA-Java Core Algorithms from the book Artificial Intelligence a Modern Approach 3rd Ed.
The newest version!
package aima.core.learning.reinforcement.agent;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import aima.core.agent.Action;
import aima.core.learning.reinforcement.PerceptStateReward;
import aima.core.probability.mdp.ActionsFunction;
import aima.core.probability.mdp.PolicyEvaluation;
import aima.core.probability.mdp.RewardFunction;
import aima.core.probability.mdp.TransitionProbabilityFunction;
import aima.core.probability.mdp.impl.MDP;
import aima.core.util.FrequencyCounter;
import aima.core.util.datastructure.Pair;
/**
* Artificial Intelligence A Modern Approach (3rd Edition): page 834.
*
*
*
* function PASSIVE-ADP-AGENT(percept) returns an action
* inputs: percept, a percept indicating the current state s' and reward signal r'
* persistent: π, a fixed policy
* mdp, an MDP with model P, rewards R, discount γ
* U, a table of utilities, initially empty
* Nsa, a table of frequencies for state-action pairs, initially zero
* Ns'|sa, a table of outcome frequencies given state-action pairs, initially zero
* s, a, the previous state and action, initially null
*
* if s' is new then U[s'] <- r'; R[s'] <- r'
* if s is not null then
* increment Nsa[s,a] and Ns'|sa[s',s,a]
* for each t such that Ns'|sa[t,s,a] is nonzero do
* P(t|s,a) <- Ns'|sa[t,s,a] / Nsa[s,a]
* U <- POLICY-EVALUATION(π, U, mdp)
* if s'.TERMINAL? then s,a <- null else s,a <- s',π[s']
* return a
*
*
* Figure 21.2 A passive reinforcement learning agent based on adaptive dynamic
* programming. The POLICY-EVALUATION function solves the fixed-policy Bellman
* equations, as described on page 657.
*
* @param <S>
*            the state type.
* @param <A>
*            the action type.
*
* @author Ciaran O'Reilly
* @author Ravi Mohan
*
*/
public class PassiveADPAgent extends
ReinforcementAgent {
// persistent: π, a fixed policy
private Map pi = new HashMap();
// mdp, an MDP with model P, rewards R, discount γ
private MDP mdp = null;
private Map>, Double> P = new HashMap>, Double>();
private Map R = new HashMap();
private PolicyEvaluation policyEvaluation = null;
// U, a table of utilities, initially empty
private Map U = new HashMap();
// Nsa, a table of frequencies for state-action pairs, initially
// zero
private FrequencyCounter> Nsa = new FrequencyCounter>();
// Ns'|sa, a table of outcome frequencies give state-action
// pairs, initially zero
private FrequencyCounter>> NsDelta_sa = new FrequencyCounter>>();
// s, a, the previous state and action, initially null
private S s = null;
private A a = null;
/**
* Constructor.
*
* @param fixedPolicy
* π a fixed policy.
* @param states
* the possible states in the world (i.e. fully observable).
* @param initialState
* the initial state for the agent.
* @param actionsFunction
* a function that lists the legal actions from a state.
* @param policyEvaluation
* a function for evaluating a policy.
*/
public PassiveADPAgent(Map fixedPolicy, Set states,
S initialState, ActionsFunction actionsFunction,
PolicyEvaluation policyEvaluation) {
this.pi.putAll(fixedPolicy);
this.mdp = new MDP(states, initialState, actionsFunction,
new TransitionProbabilityFunction() {
public double probability(S sDelta, S s, A a) {
Double p = P.get(new Pair>(sDelta,
new Pair(s, a)));
return null == p ? 0.0 : p.doubleValue();
}
}, new RewardFunction() {
public double reward(S s) {
return R.get(s);
}
});
this.policyEvaluation = policyEvaluation;
}
/**
* Passive reinforcement learning based on adaptive dynamic programming.
*
* @param percept
* a percept indicating the current state s' and reward signal
* r'.
* @return an action
*/
@Override
public A execute(PerceptStateReward percept) {
// if s' is new then U[s'] <- r'; R[s'] <- r'
S sDelta = percept.state();
double rDelta = percept.reward();
if (!U.containsKey(sDelta)) {
U.put(sDelta, rDelta);
R.put(sDelta, rDelta);
}
// if s is not null then
if (null != s) {
// increment Nsa[s,a] and Ns'|sa[s',s,a]
Pair sa = new Pair(s, a);
Nsa.incrementFor(sa);
NsDelta_sa.incrementFor(new Pair>(sDelta, sa));
// for each t such that Ns'|sa[t,s,a] is nonzero do
for (S t : mdp.states()) {
Pair> t_sa = new Pair>(t, sa);
if (0 != NsDelta_sa.getCount(t_sa)) {
// P(t|s,a) <- Ns'|sa[t,s,a] /
// Nsa[s,a]
P.put(t_sa, NsDelta_sa.getCount(t_sa).doubleValue()
/ Nsa.getCount(sa).doubleValue());
}
}
}
// U <- POLICY-EVALUATION(π, U, mdp)
U = policyEvaluation.evaluate(pi, U, mdp);
// if s'.TERMINAL? then s,a <- null else s,a <- s',π[s']
if (isTerminal(sDelta)) {
s = null;
a = null;
} else {
s = sDelta;
a = pi.get(sDelta);
}
// return a
return a;
}
@Override
public Map getUtility() {
return Collections.unmodifiableMap(U);
}
@Override
public void reset() {
P.clear();
R.clear();
U = new HashMap();
Nsa.clear();
NsDelta_sa.clear();
s = null;
a = null;
}
//
// PRIVATE METHODS
//
private boolean isTerminal(S s) {
boolean terminal = false;
if (0 == mdp.actions(s).size()) {
// No actions possible in state is considered terminal.
terminal = true;
}
return terminal;
}
}