aima.core.learning.reinforcement.agent.PassiveADPAgent Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of aima-core Show documentation
AIMA-Java Core Algorithms from the book Artificial Intelligence a Modern Approach 3rd Ed.
The newest version!
package aima.core.learning.reinforcement.agent;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import aima.core.agent.Action;
import aima.core.learning.reinforcement.PerceptStateReward;
import aima.core.probability.mdp.ActionsFunction;
import aima.core.probability.mdp.PolicyEvaluation;
import aima.core.probability.mdp.RewardFunction;
import aima.core.probability.mdp.TransitionProbabilityFunction;
import aima.core.probability.mdp.impl.MDP;
import aima.core.util.FrequencyCounter;
import aima.core.util.datastructure.Pair;
/**
* Artificial Intelligence A Modern Approach (3rd Edition): page 834.
*
*
*
* function PASSIVE-ADP-AGENT(percept) returns an action
* inputs: percept, a percept indicating the current state s' and reward signal r'
* persistent: π, a fixed policy
* mdp, an MDP with model P, rewards R, discount γ
* U, a table of utilities, initially empty
* Nsa, a table of frequencies for state-action pairs, initially zero
* Ns'|sa, a table of outcome frequencies given state-action pairs, initially zero
* s, a, the previous state and action, initially null
*
* if s' is new then U[s'] <- r'; R[s'] <- r'
* if s is not null then
* increment Nsa[s,a] and Ns'|sa[s',s,a]
* for each t such that Ns'|sa[t,s,a] is nonzero do
* P(t|s,a) <- Ns'|sa[t,s,a] / Nsa[s,a]
* U <- POLICY-EVALUATION(π, U, mdp)
* if s'.TERMINAL? then s,a <- null else s,a <- s',π[s']
* return a
*
*
* Figure 21.2 A passive reinforcement learning agent based on adaptive dynamic
* programming. The POLICY-EVALUATION function solves the fixed-policy Bellman
* equations, as described on page 657.
*
* @param <S>
*            the state type.
* @param <A>
*            the action type.
*
* @author Ciaran O'Reilly
* @author Ravi Mohan
*
*/
public class PassiveADPAgent extends
ReinforcementAgent {
// persistent: π, a fixed policy
private Map pi = new HashMap();
// mdp, an MDP with model P, rewards R, discount γ
private MDP mdp = null;
private Map>, Double> P = new HashMap>, Double>();
private Map R = new HashMap();
private PolicyEvaluation policyEvaluation = null;
// U, a table of utilities, initially empty
private Map U = new HashMap();
// Nsa, a table of frequencies for state-action pairs, initially
// zero
private FrequencyCounter> Nsa = new FrequencyCounter>();
// Ns'|sa, a table of outcome frequencies give state-action
// pairs, initially zero
private FrequencyCounter>> NsDelta_sa = new FrequencyCounter>>();
// s, a, the previous state and action, initially null
private S s = null;
private A a = null;
/**
* Constructor.
*
* @param fixedPolicy
* π a fixed policy.
* @param states
* the possible states in the world (i.e. fully observable).
* @param initialState
* the initial state for the agent.
* @param actionsFunction
* a function that lists the legal actions from a state.
* @param policyEvaluation
* a function for evaluating a policy.
*/
public PassiveADPAgent(Map fixedPolicy, Set states,
S initialState, ActionsFunction actionsFunction,
PolicyEvaluation policyEvaluation) {
this.pi.putAll(fixedPolicy);
this.mdp = new MDP(states, initialState, actionsFunction,
new TransitionProbabilityFunction() {
public double probability(S sDelta, S s, A a) {
Double p = P.get(new Pair>(sDelta,
new Pair(s, a)));
return null == p ? 0.0 : p.doubleValue();
}
}, new RewardFunction() {
public double reward(S s) {
return R.get(s);
}
});
this.policyEvaluation = policyEvaluation;
}
/**
* Passive reinforcement learning based on adaptive dynamic programming.
*
* @param percept
* a percept indicating the current state s' and reward signal
* r'.
* @return an action
*/
@Override
public A execute(PerceptStateReward percept) {
// if s' is new then U[s'] <- r'; R[s'] <- r'
S sDelta = percept.state();
double rDelta = percept.reward();
if (!U.containsKey(sDelta)) {
U.put(sDelta, rDelta);
R.put(sDelta, rDelta);
}
// if s is not null then
if (null != s) {
// increment Nsa[s,a] and Ns'|sa[s',s,a]
Pair sa = new Pair(s, a);
Nsa.incrementFor(sa);
NsDelta_sa.incrementFor(new Pair>(sDelta, sa));
// for each t such that Ns'|sa[t,s,a] is nonzero do
for (S t : mdp.states()) {
Pair> t_sa = new Pair>(t, sa);
if (0 != NsDelta_sa.getCount(t_sa)) {
// P(t|s,a) <- Ns'|sa[t,s,a] /
// Nsa[s,a]
P.put(t_sa, NsDelta_sa.getCount(t_sa).doubleValue()
/ Nsa.getCount(sa).doubleValue());
}
}
}
// U <- POLICY-EVALUATION(π, U, mdp)
U = policyEvaluation.evaluate(pi, U, mdp);
// if s'.TERMINAL? then s,a <- null else s,a <- s',π[s']
if (isTerminal(sDelta)) {
s = null;
a = null;
} else {
s = sDelta;
a = pi.get(sDelta);
}
// return a
return a;
}
@Override
public Map getUtility() {
return Collections.unmodifiableMap(U);
}
@Override
public void reset() {
P.clear();
R.clear();
U = new HashMap();
Nsa.clear();
NsDelta_sa.clear();
s = null;
a = null;
}
//
// PRIVATE METHODS
//
private boolean isTerminal(S s) {
boolean terminal = false;
if (0 == mdp.actions(s).size()) {
// No actions possible in state is considered terminal.
terminal = true;
}
return terminal;
}
}