"""
|
|
q_agent.py
|
|
This submodule contains the Agent class, which implements a Q-learning agent with eligibility traces (TD-lambda). The agent learns to make decisions based on its sensory state and rewards received from the environment. The agent uses an epsilon-greedy action-selection strategy.
|
|
|
|
Usage:
|
|
import q_agent
|
|
|
|
Class:
|
|
Agent(obsSpace, actSpace, alpha=0.1, gamma=0.95, epsilon=0.01, lmbda=0.96)
|
|
|
|
Attributes:
|
|
obsSpace (tuple): The shape of the observation space.
|
|
actSpace (tuple): The shape of the action space.
|
|
ftrSpace (tuple): The shape of the feature space.
|
|
n_features (int): The total number of features.
|
|
n_actions (int): The total number of actions.
|
|
weights (numpy.ndarray): The Q-function weights.
|
|
trace (numpy.ndarray): The eligibility trace for each feature.
|
|
featureToIndexMap (numpy.ndarray): A mapping from feature indices to the corresponding weights.
|
|
allActions (list): A list of all possible actions.
|
|
alpha (float): The learning rate for updating weights.
|
|
gamma (float): The discount factor for future rewards.
|
|
epsilon (float): The exploration rate for epsilon-greedy action selection.
|
|
lmbda (float): The decay factor for eligibility traces.
|
|
sensoryState (numpy.ndarray): The current sensory state of the agent.
|
|
previousSensoryState (numpy.ndarray): The previous sensory state of the agent.
|
|
action (int): The current action taken by the agent.
|
|
previousAction (int): The previous action taken by the agent.
|
|
episoden (int): The episode number the agent is in.
|
|
recentReset (bool): Indicates if the agent was recently reset.
|
|
|
|
Methods:
|
|
reset():
|
|
Resets the agent's traces, sensory states, and actions.
|
|
|
|
predictPayoffsForAllActions() -> List[float]:
|
|
Predicts the expected payoffs for all possible actions given the current sensory state.
|
|
|
|
plasticUpdate():
|
|
Updates the agent's Q-function weights and eligibility traces based on the current sensory state, action, and received reward. Uses epsilon-greedy action selection.
|
|
|
|
staticUpdate():
|
|
Updates the agent's action based on the current sensory state without updating weights or traces. Uses greedy action selection.
|
|
|
|
Examples:
|
|
>>> from q_agent import Agent
|
|
>>> obsSpace, actSpace = (2, 2), (3,)
|
|
>>> agent = Agent(obsSpace=obsSpace, actSpace=actSpace)
|
|
"""
|
|
|
|
import traceback

import numpy as np
from collections import defaultdict
from typing import List, Tuple, Union

from deap import creator, base, tools, algorithms

LOGGING = False

import logging, sys
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
log = logging.getLogger()

if not LOGGING:
    # remove all logging functionality
    for handler in log.handlers.copy():
        try:
            log.removeHandler(handler)
        except ValueError:  # in case another thread has already removed it
            pass
    log.addHandler(logging.NullHandler())
    log.propagate = False


# The Agent class, similar to what is used in MABE.
# Note: this is unlike how standard RL/ML folks structure
# these algorithms. Here, we separate out concerns for
# modularity. A side effect is that an update (one
# cognitive step) receives the reward for the action
# chosen on the *previous* update, so one extra update
# must be called when an episode terminates.
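# A minimal sketch of the resulting calling convention (illustrative only; the
# `env` object and its reset()/step() methods are assumptions, not part of this
# module):
#
#     agent = Agent(obsSpace=(2, 2), actSpace=(3,))
#     for episode in range(n_episodes):
#         agent.reset()
#         obs, done = env.reset(), False
#         while not done:
#             agent.sensoryState[:] = obs      # write the new observation
#             agent.plasticUpdate()            # learns from agent.reward, then stores its choice in agent.action
#             obs, agent.reward, done = env.step(agent.action)
#         agent.sensoryState[:] = obs
#         agent.plasticUpdate()                # one extra update to learn from the final reward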
class Agent():


    def __init__(i, obsSpace, actSpace, alpha=0.1, gamma=0.95, epsilon=0.01, lmbda=0.96):
        i.obsSpace = np.array(obsSpace)
        i.actSpace = np.array(actSpace)
        i.ftrSpace = tuple(obsSpace) + tuple(actSpace)
        i.n_features = np.prod(i.ftrSpace)
        i.n_actions = actSpace[0]  # not general: assumes a single discrete action dimension
        i.weights = np.zeros(i.n_features)
        i.trace = np.zeros(i.n_features)
        i.featureToIndexMap = np.arange(i.n_features).reshape(i.ftrSpace)
        i.allActions = list(range(i.n_actions))
        # learning hyperparameters
        i.alpha = alpha      # learning rate: how much to weigh reward surprises that deviate from expectation
        i.gamma = gamma      # discount factor: how important expected future rewards are
        i.epsilon = epsilon  # fraction of exploration to exploitation (how often to choose a random action)
        i.lmbda = lmbda      # trace decay: how important preceding actions are in learning adaptation
        i.sensoryState = np.zeros(len(i.obsSpace), dtype=np.int32)
        i.previousSensoryState = np.zeros(len(i.obsSpace), dtype=np.int32)
        i.action = 0
        i.previousAction = 0
        i.reward = -1  # reward for the previous action; also (re)set in reset()
        i.episoden = 0
        i.recentReset = True

    def reset(i):  # resets traces, sensory states, and actions, but not weights
        log.info("resetting agent")
        i.trace = np.zeros(i.n_features)
        i.sensoryState = np.zeros(len(i.obsSpace), dtype=np.int32)
        i.previousSensoryState = np.zeros(len(i.obsSpace), dtype=np.int32)
        i.action = 0
        i.previousAction = 0
        i.reward = -1
        i.recentReset = True

    def predictPayoffsForAllActions(i) -> List[float]:
        '''combines the current sensoryState with every possible action to return the predicted payoff of each action

        >>> obsSpace, actSpace, ftrSpace = (2,2), (3,), (2,2)+(3,)
        >>> i = Agent(obsSpace=obsSpace, actSpace=actSpace)
        >>> (i.featureToIndexMap == np.arange(i.n_features).reshape((2,2,3))).all()
        True
        >>> i.sensoryState[:] = [1,0]
        >>> i.weights = np.zeros(12)
        >>> i.weights[6:9] = [1.,2.,3.] # weights associated with features (1,0,<action>) with actions 0,1,2
        >>> i.predictPayoffsForAllActions()
        [1.0, 2.0, 3.0]
        '''
        try:
            # each (state, action) pair maps to one entry of featureToIndexMap,
            # so this linear Q-function reduces to a table lookup
            featureKeys = [tuple(i.sensoryState) + (action,) for action in i.allActions]
            featuresForEachAction = [i.featureToIndexMap[fki] for fki in featureKeys]
            return [i.weights[features].sum() for features in featuresForEachAction]
        except Exception:
            estr = f"Error: {traceback.format_exc()}"
            print(estr)
            print('featureToIndexMap', i.featureToIndexMap)
            print('featureKeys', featureKeys)
            print('sensoryState', i.sensoryState, 'allActions', i.allActions)
            return [np.nan for x in range(len(i.allActions))]

    def plasticUpdate(i):
        # One cognitive step with learning: a TD-lambda algorithm
        # with epsilon-greedy action-selection
        # (epsilon annealing could be used; it was removed again)

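        # For reference, the learning step below implements (s_prev, a_prev are the
        # state seen and the action chosen at the previous update; at this point in
        # the code they are stored in previousSensoryState and action):
        #
        #     target   = reward + gamma * max_a Q(sensoryState, a)   # bootstrap left at 0 when reward >= 0 (goal) or on exploratory steps
        #     surprise = target - Q(s_prev, a_prev)
        #     trace[features(s_prev, a_prev)] = 1                    # "replacing" traces
        #     weights += alpha * surprise * trace
        #     trace   *= lmbda
        #
        # Note the trace decays by lmbda alone here, rather than the gamma * lmbda
        # decay used in textbook TD(lambda); this description matches the code as written.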
        # determine predicted payoff
        nextActionPredictedPayoff = 0.0  # used to find the surprise between expected and received payoff
        nextAction = 0
        # epsilon-greedy action-selection
        if np.random.random() < i.epsilon:
            # explore: choose a random action
            nextAction = np.random.choice(i.n_actions)
        else:
            # exploit: choose the best predicted action
            try:
                q_vals = i.predictPayoffsForAllActions()
                nextAction = np.argmax(q_vals)
                if i.reward >= 0.0:  # goal achieved
                    nextActionPredictedPayoff = 0.0
                else:
                    nextActionPredictedPayoff = q_vals[nextAction]
            except Exception:
                estr = f"Error: {traceback.format_exc()}"
                print(estr)
                print("q_vals", q_vals)
        # only update weights if at least 1 experience has been accumulated
        if not i.recentReset:
            # determine the corrected payoff given the reward actually received
            previousActionCorrectedPayoff = i.reward + (nextActionPredictedPayoff * i.gamma)
            # use this information to update the weights for the last action-selection, based on how surprised we were
            features = i.featureToIndexMap[tuple(i.previousSensoryState) + (i.action,)]
            previousActionPredictedPayoff = i.weights[features].sum()
            surprise = previousActionCorrectedPayoff - previousActionPredictedPayoff
            # set the eligibility trace for the features just used (replacing traces)
            i.trace[features] = 1.0
            # update weights in proportion to the surprise, along the eligibility trace
            i.weights += i.alpha * surprise * i.trace
            # decay the trace
            i.trace *= i.lmbda
        # keep track of state and action at t and t-1
        i.previousSensoryState = i.sensoryState.copy()  # copy, not a view, so in-place writes to sensoryState don't overwrite it
        i.action = nextAction
        i.recentReset = False

    def staticUpdate(i):
        # same as plasticUpdate, but without learning
        # (a.k.a. 'deployment')

        # greedy action-selection
        q_vals = i.predictPayoffsForAllActions()
        nextAction = np.argmax(q_vals)
        # step the storage of state and action in memory
        i.previousSensoryState = i.sensoryState.copy()
        i.action = nextAction


"""
|
|
This derived class adds a mutation_rate attribute, as well as methods for mutation, crossover, and fitness handling. You can then use an evolutionary algorithm to evolve a population of EvolvableAgent instances by applying selection, crossover, and mutation operations based on the agents' fitness values.
|
|
"""
|
|
|
|
def tuple_shape(input_tuple):
    if not isinstance(input_tuple, tuple):
        try:
            return input_tuple.shape  # e.g. numpy arrays
        except AttributeError:
            raise TypeError("Input must be a tuple or have a .shape attribute")

    # Check if the tuple is nested (i.e., if it's a multidimensional tuple)
    if any(isinstance(item, tuple) for item in input_tuple):
        shape = []
        while isinstance(input_tuple, tuple):
            shape.append(len(input_tuple))
            input_tuple = input_tuple[0]
        return tuple(shape)
    else:
        return (len(input_tuple),)

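# Illustrative examples of tuple_shape (shown as comments rather than doctests,
# since the function is only lightly exercised elsewhere in this module):
#
#     tuple_shape((1, 2, 3))                 ->  (3,)
#     tuple_shape(((1, 2, 3), (4, 5, 6)))    ->  (2, 3)
#     tuple_shape(np.zeros((2, 2, 3)))       ->  (2, 2, 3)   # falls through to .shape
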
class Holder(object):
    def __init__(self):
        pass


class EvolvableAgent(Agent):
    """EvolvableAgent

    This class extends the Agent class from q_agent.py, adding functionality for evolutionary
    computation. The EvolvableAgent class can be used with evolutionary algorithms to optimize
    the agent's performance through mutation, crossover, and selection based on fitness values.

    Usage:
        from q_agent import EvolvableAgent

    Class:
        EvolvableAgent(obsSpace, actSpace, alpha=0.1, gamma=0.95, epsilon=0.01, lmbda=0.96,
                       mutation_rate=0.05, crossover_rate=0.01, fitness=None)

    Attributes (in addition to Agent attributes):
        mutation_rate (float): The probability of each weight being mutated during mutation.
        crossover_rate (float): The probability that crossover mixes the two parents' germlines
            (otherwise the offspring inherits this agent's germline unchanged).
        fitness: The fitness of the agent, used for evaluation and selection in an evolutionary
            algorithm. set_fitness()/get_fitness() expect an object with a .values tuple
            (e.g. a DEAP fitness); set_wfitness()/get_wfitness() store a plain float instead.

    Methods (in addition to Agent methods):
        mutate():
            Mutates the agent's weights by adding small random values drawn from a normal
            distribution. The mutation_rate attribute determines the probability of each
            weight being mutated.

        crossover(other: 'EvolvableAgent') -> 'EvolvableAgent':
            Performs uniform crossover between this agent and another agent, creating a new
            offspring agent.
            Args:
                other (EvolvableAgent): The other agent to perform crossover with.
            Returns:
                EvolvableAgent: The offspring agent resulting from the crossover.

        set_fitness(fitness: float):
            Sets the fitness value for the agent.
            Args:
                fitness (float): The fitness value to be set.

        get_fitness() -> float:
            Gets the fitness value of the agent.
            Returns:
                float: The fitness value of the agent.

    Examples:
        >>> from q_agent import EvolvableAgent
        >>> obsSpace, actSpace = (2, 2), (3,)
        >>> agent = EvolvableAgent(obsSpace=obsSpace, actSpace=actSpace, mutation_rate=0.05)
    """
    def __init__(self, obsSpace, actSpace, alpha=0.1, gamma=0.95, epsilon=0.01, lmbda=0.96,
                 mutation_rate=0.05, crossover_rate=0.01, fitness=None):
        super().__init__(obsSpace, actSpace, alpha, gamma, epsilon, lmbda)
        self.germline = self.weights  # note: germline and weights share the same array
        self.mutation_rate = mutation_rate
        self.crossover_rate = crossover_rate
        self.wfitness = None
        self.fitness = fitness
        self.init_fitness = fitness

    def mutate(self):
        """
        Mutate the agent's weights by adding small random values drawn from a normal distribution.
        The mutation_rate attribute determines the probability of each weight being mutated.
        """
        wtshape = self.weights.shape
        glshape = self.germline.shape
        mutation_mask = np.random.random(self.germline.shape) < self.mutation_rate
        self.germline[mutation_mask] += np.random.normal(loc=0, scale=0.01, size=np.sum(mutation_mask))
        self.weights = self.germline
        assert glshape == self.germline.shape, "Error: mutate() germline shape has changed"
        assert wtshape == self.weights.shape, "Error: mutate() weights shape has changed"

    def crossover(self, other: 'EvolvableAgent') -> 'EvolvableAgent':
        """
        Perform uniform crossover between this agent and another agent, creating a new offspring agent.
        Args:
            other (EvolvableAgent): The other agent to perform crossover with.
        Returns:
            EvolvableAgent: The offspring agent resulting from the crossover.
        """
        wtshape = self.weights.shape
        glshape = self.germline.shape
        offspring = EvolvableAgent(self.obsSpace, self.actSpace, self.alpha, self.gamma, self.epsilon, self.lmbda,
                                   self.mutation_rate, self.crossover_rate, self.init_fitness)
        if np.random.random() <= self.crossover_rate:
            crossover_mask = np.random.randint(0, 2, size=self.germline.shape, dtype=bool)
            offspring.germline = np.where(crossover_mask, self.germline, other.germline)
        else:
            offspring.germline = self.germline.copy()  # copy, so mutating the offspring cannot modify this parent's germline
        offspring.weights = offspring.germline
        assert self.obsSpace.shape == offspring.obsSpace.shape, f"Error: offspring has different obsSpace {offspring.obsSpace} != {self.obsSpace}"
        assert self.actSpace.shape == offspring.actSpace.shape, f"Error: offspring has different actSpace {offspring.actSpace} != {self.actSpace}"
        assert tuple_shape(self.ftrSpace) == tuple_shape(offspring.ftrSpace), f"Error: offspring has different ftrSpace {offspring.ftrSpace} {offspring.obsSpace} {offspring.actSpace} != {self.ftrSpace} {self.obsSpace} {self.actSpace}"
        assert glshape == offspring.germline.shape, "Error: offspring germline shape has changed"
        assert wtshape == offspring.weights.shape, "Error: offspring weights shape has changed"
        return offspring

    def set_wfitness(self, fitness: float):
        """
        Set the fitness value for the agent (stored on self.wfitness as a plain float).
        Args:
            fitness (float): The fitness value to be set.
        """
        self.wfitness = fitness

    def get_wfitness(self) -> float:
        """
        Get the fitness value of the agent (from self.wfitness).
        Returns:
            float: The fitness value of the agent.
        """
        return self.wfitness

    def set_fitness(self, fitness: float):
        """
        Set the fitness value for the agent. This writes to self.fitness.values, so
        self.fitness must be a fitness object with a .values tuple (e.g. a DEAP fitness).
        Args:
            fitness (float): The fitness value to be set.
        """
        self.fitness.values = (fitness,)

    def get_fitness(self) -> float:
        """
        Get the fitness value of the agent (from self.fitness.values).
        Returns:
            float: The fitness value of the agent.
        """
        return self.fitness.values[0]


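# A minimal sketch of one generation of an evolutionary loop over EvolvableAgent
# instances (illustrative only; `evaluate`, `pop_size`, and the tournament scheme
# are assumptions, not part of this module). It uses the plain-float
# set_wfitness()/get_wfitness() pair so no DEAP fitness object is required:
#
#     pop = [EvolvableAgent(obsSpace=(2, 2), actSpace=(3,)) for _ in range(pop_size)]
#     for agent in pop:
#         agent.set_wfitness(evaluate(agent))   # e.g. total reward over some episodes
#     def tournament(pop, k=2):
#         contestants = [pop[np.random.randint(len(pop))] for _ in range(k)]
#         return max(contestants, key=lambda a: a.get_wfitness())
#     next_pop = []
#     for _ in range(pop_size):
#         child = tournament(pop).crossover(tournament(pop))
#         child.mutate()
#         next_pop.append(child)
#     pop = next_pop
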
if __name__ == '__main__':
    '''Test important functions and workflows with doctests.
    Run this python file by itself to run these tests, and set
    LOGGING=True near the top of the file.'''
    import doctest
    from functools import partial
    #doctest.testmod()
    test = partial(doctest.run_docstring_examples, globs=globals())
    test(Agent.predictPayoffsForAllActions)
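
    # A minimal, self-contained smoke run (an illustrative sketch, not a formal
    # test): drive one Agent through a toy loop with random observations and a
    # made-up reward rule, just to exercise the calling convention in which the
    # reward for the previous action is supplied before the next plasticUpdate().
    agent = Agent(obsSpace=(2, 2), actSpace=(3,))
    agent.reset()
    for step in range(20):
        agent.sensoryState[:] = np.random.randint(0, 2, size=2)  # new observation
        agent.plasticUpdate()                                     # learn from the previous reward, pick agent.action
        agent.reward = 1.0 if agent.action == 0 else -1.0         # toy reward for the action just taken
    agent.plasticUpdate()  # one extra update so the final reward is learned from
    log.info(f"toy-run weights: {agent.weights}")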