# In [None]:
import numpy as np, itertools, copy
import matplotlib.pyplot as plt
from collections import defaultdict
import importlib # module reloading

# allow importing from the 'code/' dir
import sys
sys.path.append("../code")

import environments
import agents
# always forces a reload in case you have edited environments or agents
importlib.reload(environments)
importlib.reload(agents)
from environments.gridworld import GridWorld
from agents.q_agent import Agent

# ---- problem-domain dependent settings ----
dims = [4, 12]        # grid size as [rows, columns]
n_actions = 4         # size of the (discrete) action space
# observation space mirrors the grid dimensions; action space is sized by n_actions
obsSpace, actSpace = (dims[0], dims[1]), (n_actions,)
num_trials = 1000
# (optimal lmbda in the agent is domain dependent - could be evolved)
HARD_TIME_LIMIT = 50  # a trial is cut off once this many steps have been taken
KILLED_REWARD = -10   # a reward at or below this value is treated as an explicit failure
# (standard reward) = -1.0 (means agent is potentially wasting time - set internal to agent code)
# (goal reward) = 1.0 (means the agent achieved something good - set internal to agent code)

# Derive the special cells from dims so the map stays consistent if dims changes.
bottom_row = dims[0] - 1
last_col = dims[1] - 1

# create our own GridWorld that adheres to openAI-gym environment API during training
env = GridWorld(dims=dims, startState=[bottom_row, 0])

# 4rows x 12columns (0,0) is top-left
# -: empty location
# S: Start location
# G: Goal location
# x: immediate fail (a hole / cliff)
#
# (map of grid world)
# ------------
# ------------
# ------------
# SxxxxxxxxxxG

# add goals and holes
# supports multiple goals, use 1 for now (bottom-right corner)
env.goals.append([bottom_row, last_col])
# supports multiple 'kill zones' (cliff edge, in openAI parlance):
# every bottom-row cell strictly between the start and the goal
for col in range(1, last_col):
    env.holes.append([bottom_row, col])

agent = Agent(obsSpace=obsSpace, actSpace=actSpace, alpha=0.1, gamma=0.95, epsilon=0.01, lmbda=0.42)
# alpha     # how much to weigh reward surprises that deviate from expectation
# gamma     # how important expected rewards will be
# epsilon   # fraction of exploration to exploitation (how often to choose a random action)
# lmbda     # how slowly memory of preceding actions fades away (1=never; 0=presumably immediately - TODO confirm)

# Run num_trials training trials and record the step count of each one
# (lower is better).
time_to_solve_each_trial = []
for trialN in range(num_trials):
    # progress indicator: one dot every 10th trial
    if trialN % 10 == 0:
        print('.',end='')

    # start of trial: soft-reset the agent (keeps learned weights) and
    # put the environment back into its initial state
    agent.reset()
    nextState = env.reset()

    # `time` counts environment steps taken in this trial: 1, 2, 3, ...
    for time in itertools.count(1):
        # feed the current observation to the agent and let it pick an action
        agent.sensoryState = nextState
        agent.plasticUpdate()

        # apply the action; env.step returns an openAI-gym style 4-tuple
        nextState, reward, goal_achieved, _ = env.step(agent.action)
        agent.reward = reward

        # the trial ends on success or when the step budget is exhausted
        # NOTE(review): on this exit no further plasticUpdate() runs, unlike
        # the failure branch below - confirm the agent still learns the
        # final reward.
        if goal_achieved or time == HARD_TIME_LIMIT:
            break

        # explicit early failure: the agent gets one extra update so it can
        # 'learn' the bad reward, then agent and environment restart.
        # The step counter is NOT reset, so failed attempts count towards
        # the recorded time for this trial.
        if reward <= KILLED_REWARD:
            agent.sensoryState = nextState
            agent.reward = reward
            agent.plasticUpdate()
            agent.reset()
            nextState = env.reset()

    # record trial results
    time_to_solve_each_trial.append(time)
    
# finish the line of progress dots printed during training
print()

# learning curve: steps taken in each trial
pt = 15  # font size (points) shared by the title and both axis labels
plt.plot(time_to_solve_each_trial)
plt.title('Time until agent solved trial', fontsize=pt)
plt.xlabel('Trial', fontsize=pt)
plt.ylabel('Time', fontsize=pt)

# legend for the path rendering below
print("green dot: start location")
print("red dot: finish location")
# show the path the agent took in GridWorld
# (per the original note, rendering uses the non-learning staticUpdate() - TODO confirm)
env.render(agent)