
aima.core.learning.reinforcement.agent.PassiveADPAgent

AIMA-Java core algorithms from the book Artificial Intelligence: A Modern Approach, 3rd Edition.

package aima.core.learning.reinforcement.agent;

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import aima.core.agent.Action;
import aima.core.learning.reinforcement.PerceptStateReward;
import aima.core.probability.mdp.ActionsFunction;
import aima.core.probability.mdp.PolicyEvaluation;
import aima.core.probability.mdp.RewardFunction;
import aima.core.probability.mdp.TransitionProbabilityFunction;
import aima.core.probability.mdp.impl.MDP;
import aima.core.util.FrequencyCounter;
import aima.core.util.datastructure.Pair;

/**
 * Artificial Intelligence A Modern Approach (3rd Edition): page 834.
 * 
 * <pre>
 * function PASSIVE-ADP-AGENT(percept) returns an action
 *   inputs: percept, a percept indicating the current state s' and reward signal r'
 *   persistent: π, a fixed policy
 *               mdp, an MDP with model P, rewards R, discount γ
 *               U, a table of utilities, initially empty
 *               Nsa, a table of frequencies for state-action pairs, initially zero
 *               Ns'|sa, a table of outcome frequencies given state-action pairs, initially zero
 *               s, a, the previous state and action, initially null
 *               
 *   if s' is new then U[s'] <- r'; R[s'] <- r'
 *   if s is not null then
 *        increment Nsa[s,a] and Ns'|sa[s',s,a]
 *        for each t such that Ns'|sa[t,s,a] is nonzero do
 *            P(t|s,a) <-  Ns'|sa[t,s,a] / Nsa[s,a]
 *   U <- POLICY-EVALUATION(π, U, mdp)
 *   if s'.TERMINAL? then s,a <- null else s,a <- s',π[s']
 *   return a
 * </pre>
 * 
 * Figure 21.2 A passive reinforcement learning agent based on adaptive
 * dynamic programming. The POLICY-EVALUATION function solves the fixed-policy
 * Bellman equations, as described on page 657.
 * 
 * @param <S>
 *            the state type.
 * @param <A>
 *            the action type.
 * 
 * @author Ciaran O'Reilly
 * @author Ravi Mohan
 */
public class PassiveADPAgent<S, A extends Action> extends
        ReinforcementAgent<S, A> {
    // persistent: π, a fixed policy
    private Map<S, A> pi = new HashMap<S, A>();
    // mdp, an MDP with model P, rewards R, discount γ
    private MDP<S, A> mdp = null;
    private Map<Pair<S, Pair<S, A>>, Double> P = new HashMap<Pair<S, Pair<S, A>>, Double>();
    private Map<S, Double> R = new HashMap<S, Double>();
    private PolicyEvaluation<S, A> policyEvaluation = null;
    // U, a table of utilities, initially empty
    private Map<S, Double> U = new HashMap<S, Double>();
    // Nsa, a table of frequencies for state-action pairs, initially zero
    private FrequencyCounter<Pair<S, A>> Nsa = new FrequencyCounter<Pair<S, A>>();
    // Ns'|sa, a table of outcome frequencies given state-action pairs,
    // initially zero
    private FrequencyCounter<Pair<S, Pair<S, A>>> NsDelta_sa = new FrequencyCounter<Pair<S, Pair<S, A>>>();
    // s, a, the previous state and action, initially null
    private S s = null;
    private A a = null;

    /**
     * Constructor.
     * 
     * @param fixedPolicy
     *            π, a fixed policy.
     * @param states
     *            the possible states in the world (i.e. fully observable).
     * @param initialState
     *            the initial state for the agent.
     * @param actionsFunction
     *            a function that lists the legal actions from a state.
     * @param policyEvaluation
     *            a function for evaluating a policy.
     */
    public PassiveADPAgent(Map<S, A> fixedPolicy, Set<S> states,
            S initialState, ActionsFunction<S, A> actionsFunction,
            PolicyEvaluation<S, A> policyEvaluation) {
        this.pi.putAll(fixedPolicy);
        this.mdp = new MDP<S, A>(states, initialState, actionsFunction,
                new TransitionProbabilityFunction<S, A>() {
                    public double probability(S sDelta, S s, A a) {
                        Double p = P.get(new Pair<S, Pair<S, A>>(sDelta,
                                new Pair<S, A>(s, a)));
                        return null == p ? 0.0 : p.doubleValue();
                    }
                }, new RewardFunction<S>() {
                    public double reward(S s) {
                        return R.get(s);
                    }
                });
        this.policyEvaluation = policyEvaluation;
    }

    /**
     * Passive reinforcement learning based on adaptive dynamic programming.
     * 
     * @param percept
     *            a percept indicating the current state s' and reward signal
     *            r'.
     * @return an action
     */
    @Override
    public A execute(PerceptStateReward<S> percept) {
        // if s' is new then U[s'] <- r'; R[s'] <- r'
        S sDelta = percept.state();
        double rDelta = percept.reward();
        if (!U.containsKey(sDelta)) {
            U.put(sDelta, rDelta);
            R.put(sDelta, rDelta);
        }
        // if s is not null then
        if (null != s) {
            // increment Nsa[s,a] and Ns'|sa[s',s,a]
            Pair<S, A> sa = new Pair<S, A>(s, a);
            Nsa.incrementFor(sa);
            NsDelta_sa.incrementFor(new Pair<S, Pair<S, A>>(sDelta, sa));
            // for each t such that Ns'|sa[t,s,a] is nonzero do
            for (S t : mdp.states()) {
                Pair<S, Pair<S, A>> t_sa = new Pair<S, Pair<S, A>>(t, sa);
                if (0 != NsDelta_sa.getCount(t_sa)) {
                    // P(t|s,a) <- Ns'|sa[t,s,a] / Nsa[s,a]
                    P.put(t_sa, NsDelta_sa.getCount(t_sa).doubleValue()
                            / Nsa.getCount(sa).doubleValue());
                }
            }
        }
        // U <- POLICY-EVALUATION(π, U, mdp)
        U = policyEvaluation.evaluate(pi, U, mdp);
        // if s'.TERMINAL? then s,a <- null else s,a <- s',π[s']
        if (isTerminal(sDelta)) {
            s = null;
            a = null;
        } else {
            s = sDelta;
            a = pi.get(sDelta);
        }
        // return a
        return a;
    }

    @Override
    public Map<S, Double> getUtility() {
        return Collections.unmodifiableMap(U);
    }

    @Override
    public void reset() {
        P.clear();
        R.clear();
        U = new HashMap<S, Double>();
        Nsa.clear();
        NsDelta_sa.clear();
        s = null;
        a = null;
    }

    //
    // PRIVATE METHODS
    //
    private boolean isTerminal(S s) {
        // A state with no possible actions is considered terminal.
        return 0 == mdp.actions(s).size();
    }
}
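
As a rough orientation, the sketch below shows how this agent might be wired up for a trivial two-state problem. It is an illustrative, hypothetical example and not part of the artifact: the example class name, the state names, the percept(...) helper, and the assumption that ModifiedPolicyEvaluation lives in aima.core.probability.mdp.search with a (sweeps, discount) constructor are all introduced here; DynamicAction is aima-core's simple Action implementation.

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import aima.core.agent.impl.DynamicAction;
import aima.core.learning.reinforcement.PerceptStateReward;
import aima.core.learning.reinforcement.agent.PassiveADPAgent;
import aima.core.probability.mdp.ActionsFunction;
import aima.core.probability.mdp.search.ModifiedPolicyEvaluation; // location assumed

// Hypothetical usage sketch, not part of aima-core.
public class PassiveADPAgentUsageSketch {
    public static void main(String[] args) {
        // A tiny made-up world: "A" is the start state, "B" is terminal.
        Set<String> states = new HashSet<String>();
        states.add("A");
        states.add("B");

        final DynamicAction right = new DynamicAction("Right");

        // Fixed policy π: always move Right from the non-terminal state.
        Map<String, DynamicAction> pi = new HashMap<String, DynamicAction>();
        pi.put("A", right);

        // Legal actions per state; an empty set makes "B" terminal for the agent.
        ActionsFunction<String, DynamicAction> actionsFunction = new ActionsFunction<String, DynamicAction>() {
            public Set<DynamicAction> actions(String s) {
                Set<DynamicAction> result = new HashSet<DynamicAction>();
                if ("A".equals(s)) {
                    result.add(right);
                }
                return result;
            }
        };

        // Constructor arguments assumed to be (number of evaluation sweeps k, discount γ).
        PassiveADPAgent<String, DynamicAction> agent = new PassiveADPAgent<String, DynamicAction>(
                pi, states, "A", actionsFunction,
                new ModifiedPolicyEvaluation<String, DynamicAction>(10, 1.0));

        // Drive the agent with one hand-built trial: A (reward -0.04), then terminal B (reward +1.0).
        agent.execute(percept("A", -0.04));
        agent.execute(percept("B", 1.0));

        System.out.println("Utility estimates so far: " + agent.getUtility());
    }

    // Hypothetical helper that adapts a (state, reward) pair to PerceptStateReward.
    private static PerceptStateReward<String> percept(final String state, final double reward) {
        return new PerceptStateReward<String>() {
            public double reward() {
                return reward;
            }

            public String state() {
                return state;
            }
        };
    }
}

In a real experiment the percept stream would come from an environment simulation running many trials, and agent.getUtility() would be inspected for convergence of the utility estimates; the two hand-built percepts above only demonstrate the call sequence.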