# In [None]:
import numpy as np, itertools, copy
import matplotlib.pyplot as plt
from collections import defaultdict
import importlib # module reloading

# allow importing from the 'code/' dir
import sys
sys.path.append("../code")

import environments
import agents
# always forces a reload in case you have edited environments or agents
importlib.reload(environments)
importlib.reload(agents)
from environments.puzzle import Puzzle, ConvBelt, Action, getActionSpace, getObservationSpace
from agents.q_agent import Agent

import copy # allows duplicating puzzles into unique puzzles, otherwise python refs are shallow-copied
# Reward values handed to the environment as "positive" rewards; kept as a
# list so that several levels of 'goodness' could be supplied.
maxrewards = [1]

# The example puzzle has four states:
#   0 - first presentation
#   1 - passed over / advancing on the belt (a placeholder, not a real state)
#   2 - investigated (close examination makes more sensory information available)
#   3 - consumed (saturating state, possibly rewarded)
#
# Transition table, one row per state.
# NOTE(review): columns presumably index the agent's actions and each entry
# is the resulting state -- confirm against Puzzle.
easy_puzzle_tt = np.array([
    [0, 0, 2, 3],  # from state 0 (first presentation)
    [0, 0, 0, 0],  # from state 1 (passed over, placeholder)
    [2, 0, 2, 3],  # from state 2 (investigated)
    [3, 3, 3, 3],  # from state 3 (consumed)
])

# Sensory features per state; this example uses 2 sensorial dimensions.
easy_puzzle_features = [
    [0, 1],  # state 0: Empty/Unknown & Spikes
    [0, 1],  # state 1: Empty/Unknown & Spikes
    [3, 1],  # state 2: Red & Spikes
    [0, 0],  # state 3: Empty/Unknown & Empty/Unknown
]

# Reward associated with each state.
easy_puzzle_rewards = [
    -1,  # state 0: first look
    -1,  # state 1: proceeding to next puzzle (placeholder)
    -1,  # state 2: investigate
    1,   # state 3: consume (could instead be -10 poisonous, or -1 empty/useless)
]

p1 = Puzzle(
    tt=easy_puzzle_tt,
    features=easy_puzzle_features,
    rewards=easy_puzzle_rewards,
)
# Deep copy so the second puzzle is an independent object, not a second
# reference to p1.
p2 = copy.deepcopy(p1)
puzzles = (p1, p2)


# Observation and action spaces derived from the puzzle set; these are the
# ones later handed to the agent.
obsSpace = getObservationSpace(puzzles)
actSpace = getActionSpace(puzzles)

# Conveyor-belt environment. The spaces are derived afresh for the
# environment here instead of reusing obsSpace/actSpace.
env = ConvBelt(
    actionSpace=getActionSpace(puzzles),            # number of actions the agent can take
    observationSpace=getObservationSpace(puzzles),  # number and sizes of sensorial dimensions
    maxRewards=maxrewards,                          # rewards that constitute positive rewards
    randomize=False,                                # randomize puzzle positions on belt at each reset()
)

# Place every puzzle on the belt, in order (append() or extend() can be used).
for puzzle in puzzles:
    env.append(puzzle)

# Domain-specific settings.
num_trials = 200
n_actions = 4
HARD_TIME_LIMIT = 600  # a trial is cut off once its step counter reaches this value
# KILLED_REWARD = -10  # not used here
# Rewards set inside the agent code:
#   standard reward = -1.0 (the agent is potentially wasting time)
#   goal reward     =  1.0 (the agent achieved something good)

# The optimal lmbda is domain dependent and could be evolved.
agent = Agent(
    obsSpace=obsSpace,
    actSpace=actSpace,
    alpha=0.1,     # how much to weigh reward surprises that deviate from expectation
    gamma=0.95,    # how important expected rewards will be
    epsilon=0.01,  # fraction of exploration to exploitation (how often to choose a random action)
    lmbda=0.42,    # how slowly memory of preceding actions fades away
                   # (1 = never; 0 presumably = immediately -- the original note was cut off)
)

# Per-trial step counts and the per-step reward log, filled by the trial loop.
time_to_solve_each_trial = []
rewards = []

# Run num_trials trials. Each trial steps the agent through the environment
# until no puzzles are left to complete or the step cap is reached, logging
# every reward and the number of steps the trial took.
for trialN in range(num_trials):
    # Some output to see it running: one dot per ten trials.
    if trialN % 10 == 0:
        print('.', end='')
    # Initialize the agent, environment, and time for this trial.
    agent.reset()  # soft reset (keeps learned weights)
    nextState = env.reset()
    time = 0
    while True:
        time += 1
        # Set agent senses based on environment and allow agent to determine an action.
        agent.sensoryState = nextState
        agent.plasticUpdate()
        # Determine effect on environment state & any reward
        # (in standard openAI-gym API format).
        nextState, reward, goal_achieved, _ = env.step(agent.action)
        agent.reward = reward
        # Log the reward BEFORE the termination check so the final reward of
        # the trial is recorded as well (it used to be skipped by the break).
        rewards.append(reward)
        # '>=' rather than '==' so the cap still applies if HARD_TIME_LIMIT
        # is below 1 or not a whole number.
        if env.puzzlesLeftToComplete == 0 or time >= HARD_TIME_LIMIT:
            # Final update so the agent can learn from the last reward.
            # NOTE(review): agent.sensoryState still holds the pre-step
            # observation here, whereas the commented-out sketch below sets
            # it to nextState first -- confirm which one Agent expects.
            agent.plasticUpdate()
            break
        # could have deadly rewards that stop the trial early
        #elif reward <= -10:
        #    agent.sensoryState = nextState
        #    agent.reward = reward
        #    agent.plasticUpdate()
        #    agent.reset()
        #    nextState = env.reset()
    time_to_solve_each_trial.append(time)
    
    
# Terminate the line of progress dots, then show the learned weights
# rounded to three decimals.
print()
learned_weights = agent.weights.round(3)
print(list(learned_weights))
#print(agent.timeSinceBigSurprise)

# Plot how many steps each trial took.
pt = 15  # font point
plt.figure(figsize=(16, 4), dpi=200)
plt.plot(time_to_solve_each_trial)
for set_text, text in (
    (plt.title, 'Time until agent solved trial (puzzle boxes)'),
    (plt.xlabel, 'Trial'),
    (plt.ylabel, 'Time'),
):
    set_text(text, fontsize=pt)
# Optional second figure with the reward log:
#figure()
#plot(rewards)

# Trailing semicolon kept on purpose: presumably it suppresses the echoed
# return value when this runs as a notebook cell.
env.render(agent);