aima.core.learning.reinforcement.agent.PassiveTDAgent Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of aima-core Show documentation
AIMA-Java Core Algorithms from the book Artificial Intelligence a Modern Approach 3rd Ed.
The newest version!
package aima.core.learning.reinforcement.agent;
import java.util.HashMap;
import java.util.Map;
import aima.core.agent.Action;
import aima.core.learning.reinforcement.PerceptStateReward;
import aima.core.util.FrequencyCounter;
/**
 * Artificial Intelligence A Modern Approach (3rd Edition): page 837.
 *
 * <pre>
 * function PASSIVE-TD-AGENT(percept) returns an action
 *   inputs: percept, a percept indicating the current state s' and reward signal r'
 *   persistent: &pi;, a fixed policy
 *               U, a table of utilities, initially empty
 *               Ns, a table of frequencies for states, initially zero
 *               s,a,r, the previous state, action, and reward, initially null
 *
 *   if s' is new then U[s'] &lt;- r'
 *   if s is not null then
 *     increment Ns[s]
 *     U[s] &lt;- U[s] + &alpha;(Ns[s])(r + &gamma;U[s'] - U[s])
 *   if s'.TERMINAL? then s,a,r &lt;- null else s,a,r &lt;- s',&pi;[s'],r'
 *   return a
 * </pre>
 *
 * Figure 21.4 A passive reinforcement learning agent that learns utility
 * estimates using temporal differences. The step-size function &alpha;(n) is
 * chosen to ensure convergence, as described in the text.
 *
 * @param <S>
 *            the state type.
 * @param <A>
 *            the action type.
 *
 * @author Ciaran O'Reilly
 * @author Ravi Mohan
 *
 */
public class PassiveTDAgent<S, A extends Action> extends
		ReinforcementAgent<S, A> {
	// persistent: π, a fixed policy
	private final Map<S, A> pi = new HashMap<S, A>();
	// U, a table of utilities, initially empty
	private final Map<S, Double> U = new HashMap<S, Double>();
	// Ns, a table of frequencies for states, initially zero
	private final FrequencyCounter<S> Ns = new FrequencyCounter<S>();
	// s,a,r, the previous state, action, and reward, initially null
	private S s = null;
	private A a = null;
	private Double r = null;
	// Fixed learning-rate and discount parameters (see constructor).
	private final double alpha;
	private final double gamma;

	/**
	 * Constructor.
	 *
	 * @param fixedPolicy
	 *            &pi; a fixed policy.
	 * @param alpha
	 *            a fixed learning rate.
	 * @param gamma
	 *            discount to be used.
	 */
	public PassiveTDAgent(Map<S, A> fixedPolicy, double alpha, double gamma) {
		this.pi.putAll(fixedPolicy);
		this.alpha = alpha;
		this.gamma = gamma;
	}

	/**
	 * Passive reinforcement learning that learns utility estimates using
	 * temporal differences.
	 *
	 * @param percept
	 *            a percept indicating the current state s' and reward signal
	 *            r'.
	 * @return an action, or null if the current state is terminal.
	 */
	@Override
	public A execute(PerceptStateReward<S> percept) {
		// if s' is new then U[s'] <- r'
		S sDelta = percept.state();
		double rDelta = percept.reward();
		if (!U.containsKey(sDelta)) {
			U.put(sDelta, rDelta);
		}
		// if s is not null then
		if (null != s) {
			// increment Ns[s]
			Ns.incrementFor(s);
			// U[s] <- U[s] + α(Ns[s])(r + γU[s'] - U[s])
			double U_s = U.get(s);
			U.put(s, U_s + alpha(Ns, s) * (r + gamma * U.get(sDelta) - U_s));
		}
		// if s'.TERMINAL? then s,a,r <- null else s,a,r <- s',π[s'],r'
		if (isTerminal(sDelta)) {
			s = null;
			a = null;
			r = null;
		} else {
			s = sDelta;
			a = pi.get(sDelta);
			r = rDelta;
		}
		// return a
		return a;
	}

	/**
	 * @return a defensive copy of the current utility estimates U.
	 */
	@Override
	public Map<S, Double> getUtility() {
		return new HashMap<S, Double>(U);
	}

	/**
	 * Reset the agent back to its initial state: empty utilities, zero state
	 * frequencies, and null previous state/action/reward.
	 */
	@Override
	public void reset() {
		U.clear();
		Ns.clear();
		s = null;
		a = null;
		r = null;
	}

	//
	// PROTECTED METHODS
	//
	/**
	 * AIMA3e pg. 836 'if we change &alpha; from a fixed parameter to a function
	 * that decreases as the number of times a state has been visited increases,
	 * then U<sup>&pi;</sup>(s) itself will converge to the correct value.'
	 *
	 * Note: override this method to obtain the desired behavior.
	 *
	 * @param Ns
	 *            a frequency counter of observed states.
	 * @param s
	 *            the current state.
	 * @return the learning rate to use based on the frequency of the state
	 *         passed in.
	 */
	protected double alpha(FrequencyCounter<S> Ns, S s) {
		// Default implementation is just to return a fixed parameter value
		// irrespective of the # of times a state has been encountered.
		return alpha;
	}

	//
	// PRIVATE METHODS
	//
	private boolean isTerminal(S s) {
		// A state with no action in the policy (or only a NoOp) is considered
		// terminal.
		boolean terminal = false;
		Action a = pi.get(s);
		if (null == a || a.isNoOp()) {
			// No actions possible in state is considered terminal.
			terminal = true;
		}
		return terminal;
	}
}