update code for chapter 13
maxpumperla committed Aug 27, 2018
1 parent e008cc9 commit 50aa0ce
Showing 20 changed files with 792 additions and 48 deletions.
2 changes: 2 additions & 0 deletions code/dlgo/agent/__init__.py
@@ -1,5 +1,7 @@
from .alphago import *
from .base import *
from .pg import *
from .predict import *
from .naive import *
from .naive_fast import *
from .termination import *
168 changes: 168 additions & 0 deletions code/dlgo/agent/alphago.py
@@ -0,0 +1,168 @@
# tag::alphago_imports[]
import numpy as np
from dlgo.agent.base import Agent
from dlgo.goboard_fast import Move
from dlgo import kerasutil
import operator
# end::alphago_imports[]


__all__ = [
    'AlphaGoNode',
    'AlphaGoMCTS'
]


# tag::init_alphago_node[]
class AlphaGoNode:
    def __init__(self, parent=None, probability=1.0):
        self.parent = parent  # <1>
        self.children = {}  # <1>

        self.visit_count = 0
        self.q_value = 0
        self.prior_value = probability  # <2>
        self.u_value = probability  # <3>
    # <1> Tree nodes have one parent and potentially many children.
    # <2> A node is initialized with a prior probability.
    # <3> The utility function will be updated during search.
    # end::init_alphago_node[]

    # tag::select_node[]
    def select_child(self):
        return max(self.children.items(),
                   key=lambda child: child[1].q_value + \
                   child[1].u_value)
    # end::select_node[]

    # tag::expand_children[]
    def expand_children(self, moves, probabilities):
        for move, prob in zip(moves, probabilities):
            if move not in self.children:
                self.children[move] = AlphaGoNode(parent=self, probability=prob)
    # end::expand_children[]

    # tag::update_values[]
    def update_values(self, leaf_value):
        if self.parent is not None:
            self.parent.update_values(leaf_value)  # <1>

        self.visit_count += 1  # <2>

        self.q_value += leaf_value / self.visit_count  # <3>

        if self.parent is not None:
            c_u = 5
            self.u_value = c_u * np.sqrt(self.parent.visit_count) \
                * self.prior_value / (1 + self.visit_count)  # <4>

    # <1> We update parents first to ensure we traverse the tree top to bottom.
    # <2> Increment the visit count for this node.
    # <3> Add the specified leaf value to the Q-value, normalized by visit count.
    # <4> Update utility with current visit counts.
    # end::update_values[]
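For reference, annotation <4> gives each node an exploration bonus of the form u = c_u * prior * sqrt(parent_visits) / (1 + visits), so children with a high prior or few visits keep getting explored. The snippet below is a minimal sketch (not part of this commit) of how the three node methods fit together; the string keys 'a' and 'b' stand in for real Move objects, since the children dictionary accepts any hashable key.

from dlgo.agent.alphago import AlphaGoNode

root = AlphaGoNode()
root.expand_children(['a', 'b'], [0.7, 0.3])   # priors from a policy network
move, child = root.select_child()              # 'a' wins on q_value + u_value
child.update_values(0.5)                       # back up a leaf evaluation
print(move, child.visit_count, child.q_value, child.u_value)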


# tag::alphago_mcts_init[]
class AlphaGoMCTS(Agent):
    def __init__(self, policy_agent, fast_policy_agent, value_agent,
                 lambda_value=0.5, num_simulations=1000,
                 depth=50, rollout_limit=100):
        self.policy = policy_agent
        self.rollout_policy = fast_policy_agent
        self.value = value_agent

        self.lambda_value = lambda_value
        self.num_simulations = num_simulations
        self.depth = depth
        self.rollout_limit = rollout_limit
        self.root = AlphaGoNode()
    # end::alphago_mcts_init[]

    # tag::alphago_mcts_rollout[]
    def select_move(self, game_state):
        for simulation in range(self.num_simulations):  # <1>
            current_state = game_state
            node = self.root
            for depth in range(self.depth):  # <2>
                if not node.children:  # <3>
                    if current_state.is_over():
                        break
                    moves, probabilities = self.policy_probabilities(current_state)  # <4>
                    node.expand_children(moves, probabilities)  # <4>

                move, node = node.select_child()  # <5>
                current_state = current_state.apply_move(move)  # <5>

            value = self.value.predict(current_state)  # <6>
            rollout = self.policy_rollout(current_state)  # <6>

            weighted_value = (1 - self.lambda_value) * value + \
                self.lambda_value * rollout  # <7>

            node.update_values(weighted_value)  # <8>
    # <1> From the current state, play out a number of simulations.
    # <2> Play moves until the specified depth is reached.
    # <3> If the current node doesn't have any children...
    # <4> ... expand them with probabilities from the strong policy.
    # <5> If there are children, we can select one and play the corresponding move.
    # <6> Compute the output of the value network and a rollout by the fast policy.
    # <7> Determine the combined value function.
    # <8> Update values for this node in the backup phase.
    # end::alphago_mcts_rollout[]
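Annotation <7> is the leaf-evaluation rule from the AlphaGo paper: the value network's estimate is mixed with the outcome of a fast-policy rollout,

    V(leaf) = (1 - lambda) * value_network(leaf) + lambda * rollout_outcome

With the default lambda_value=0.5 both signals contribute equally; lambda_value=0 trusts the value network alone, and lambda_value=1 relies entirely on rollouts.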

        # tag::alphago_mcts_selection[]
        move = max(self.root.children, key=lambda move:  # <1>
                   self.root.children.get(move).visit_count)  # <1>

        self.root = AlphaGoNode()
        if move in self.root.children:  # <2>
            self.root = self.root.children[move]
            self.root.parent = None

        return move
        # <1> Pick most visited child of the root as next move.
        # <2> If the picked move is a child, set new root to this child node.
        # end::alphago_mcts_selection[]

    # tag::alphago_policy_probs[]
    def policy_probabilities(self, game_state):
        encoder = self.policy._encoder
        outputs = self.policy.predict(game_state)
        legal_moves = [move for move in game_state.legal_moves() if move.point]
        if not legal_moves:
            return [], []
        encoded_points = [encoder.encode_point(move.point) for move in legal_moves]
        legal_outputs = outputs[encoded_points]
        normalized_outputs = legal_outputs / np.sum(legal_outputs)
        return legal_moves, normalized_outputs
    # end::alphago_policy_probs[]

    # tag::alphago_policy_rollout[]
    def policy_rollout(self, game_state):
        for step in range(self.rollout_limit):
            if game_state.is_over():
                break
            move_probabilities = self.rollout_policy.predict(game_state)
            encoder = self.rollout_policy.encoder
            valid_moves = [(idx, prob) for idx, prob in enumerate(move_probabilities)
                           if Move(encoder.decode_point_index(idx)) in game_state.legal_moves()]
            if not valid_moves:
                break
            max_index, max_value = max(valid_moves, key=operator.itemgetter(1))
            max_point = encoder.decode_point_index(max_index)
            greedy_move = Move(max_point)
            if greedy_move in game_state.legal_moves():
                game_state = game_state.apply_move(greedy_move)

        next_player = game_state.next_player
        winner = game_state.winner()
        if winner is not None:
            return 1 if winner == next_player else -1
        else:
            return 0
    # end::alphago_policy_rollout[]


    def serialize(self, h5file):
        raise IOError("AlphaGoMCTS agent can't be serialized; "
                      "consider serializing the three underlying "
                      "neural networks instead.")
86 changes: 86 additions & 0 deletions code/dlgo/agent/alphago_test.py
@@ -0,0 +1,86 @@
import unittest

from dlgo.data.processor import GoDataProcessor
from dlgo.agent.predict import DeepLearningAgent
from dlgo.networks.alphago import alphago_model
from dlgo.agent.pg import PolicyAgent
from dlgo.agent.predict import load_prediction_agent
from dlgo.encoders.alphago import AlphaGoEncoder
from dlgo.rl.simulate import experience_simulation
from dlgo.rl import ValueAgent, load_experience
from dlgo.agent import load_prediction_agent, load_policy_agent, AlphaGoMCTS
from dlgo.rl import load_value_agent
from dlgo.goboard_fast import GameState

from keras.callbacks import ModelCheckpoint
import h5py
import numpy as np

class AlphaGoAgentTest(unittest.TestCase):
    def test_1_supervised_learning(self):
        rows, cols = 19, 19
        encoder = AlphaGoEncoder()

        input_shape = (encoder.num_planes, rows, cols)
        alphago_sl_policy = alphago_model(input_shape, is_policy_net=True)

        alphago_sl_policy.compile('sgd', 'categorical_crossentropy', metrics=['accuracy'])

        alphago_sl_agent = DeepLearningAgent(alphago_sl_policy, encoder)

        inputs = np.ones((10,) + input_shape)
        outputs = alphago_sl_policy.predict(inputs)
        assert(outputs.shape == (10, 361))

        with h5py.File('test_alphago_sl_policy.h5', 'w') as sl_agent_out:
            alphago_sl_agent.serialize(sl_agent_out)

    def test_2_reinforcement_learning(self):
        encoder = AlphaGoEncoder()

        sl_agent = load_prediction_agent(h5py.File('test_alphago_sl_policy.h5', 'r'))
        sl_opponent = load_prediction_agent(h5py.File('test_alphago_sl_policy.h5', 'r'))

        alphago_rl_agent = PolicyAgent(sl_agent.model, encoder)
        opponent = PolicyAgent(sl_opponent.model, encoder)

        num_games = 1
        experience = experience_simulation(num_games, alphago_rl_agent, opponent)

        alphago_rl_agent.train(experience)

        with h5py.File('test_alphago_rl_policy.h5', 'w') as rl_agent_out:
            alphago_rl_agent.serialize(rl_agent_out)

        with h5py.File('test_alphago_rl_experience.h5', 'w') as exp_out:
            experience.serialize(exp_out)

    def test_3_alphago_value(self):
        rows, cols = 19, 19
        encoder = AlphaGoEncoder()
        input_shape = (encoder.num_planes, rows, cols)
        alphago_value_network = alphago_model(input_shape)

        alphago_value = ValueAgent(alphago_value_network, encoder)

        experience = load_experience(h5py.File('test_alphago_rl_experience.h5', 'r'))

        alphago_value.train(experience)

        with h5py.File('test_alphago_value.h5', 'w') as value_agent_out:
            alphago_value.serialize(value_agent_out)

    def test_4_alphago_mcts(self):
        fast_policy = load_prediction_agent(h5py.File('test_alphago_sl_policy.h5', 'r'))
        strong_policy = load_policy_agent(h5py.File('test_alphago_rl_policy.h5', 'r'))
        value = load_value_agent(h5py.File('test_alphago_value.h5', 'r'))

        alphago = AlphaGoMCTS(strong_policy, fast_policy, value,
                              num_simulations=20, depth=5, rollout_limit=10)
        start = GameState.new_game(19)
        alphago.select_move(start)


if __name__ == '__main__':
    unittest.main()
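The numeric prefixes in the test names matter: unittest runs test methods in alphabetical order, so the SL policy, RL policy, value network, and MCTS stages each find the HDF5 files written by the previous stage. A minimal way to run the whole pipeline in that order (a sketch, assuming the dlgo package and its Keras dependencies are installed) is:

import unittest
from dlgo.agent import alphago_test

# Test methods are loaded in name order, so the stages run 1 through 4.
suite = unittest.defaultTestLoader.loadTestsFromModule(alphago_test)
unittest.TextTestRunner(verbosity=2).run(suite)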
6 changes: 4 additions & 2 deletions code/dlgo/agent/base.py
@@ -4,8 +4,10 @@


# tag::agent[]
class Agent():
    """Interface for a go-playing bot."""
class Agent:
    def __init__(self):
        pass

    def select_move(self, game_state):
        raise NotImplementedError()
# end::agent[]
1 change: 1 addition & 0 deletions code/dlgo/agent/naive_fast.py
@@ -11,6 +11,7 @@

class FastRandomBot(Agent):
    def __init__(self):
        Agent.__init__(self)
        self.dim = None
        self.point_cache = []

54 changes: 32 additions & 22 deletions code/dlgo/agent/pg.py
@@ -12,32 +12,37 @@
__all__ = [
    'PolicyAgent',
    'load_policy_agent',
    'policy_gradient_loss',
]


# Keeping this around so we can read existing agents. But from now on
# we'll use the built-in crossentropy loss.
def policy_gradient_loss(y_true, y_pred):
    clip_pred = K.clip(y_pred, K.epsilon(), 1 - K.epsilon())
    loss = -1 * y_true * K.log(clip_pred)
    return K.mean(K.sum(loss, axis=1))


def normalize(x):
    total = np.sum(x)
    return x / total


def prepare_experience_data(experience, board_width, board_height):
    experience_size = experience.actions.shape[0]
    target_vectors = np.zeros((experience_size, board_width * board_height))
    for i in range(experience_size):
        action = experience.actions[i]
        reward = experience.rewards[i]
        target_vectors[i][action] = reward
    return target_vectors


class PolicyAgent(Agent):
    """An agent that uses a deep policy network to select moves."""
    def __init__(self, model, encoder):
        Agent.__init__(self)
        self._model = model
        self._encoder = encoder
        self._collector = None
        self._temperature = 0.0

    def predict(self, game_state):
        encoded_state = self._encoder.encode(game_state)
        input_tensor = np.array([encoded_state])
        return self._model.predict(input_tensor)[0]

    def set_temperature(self, temperature):
        self._temperature = temperature

@@ -48,14 +53,14 @@ def select_move(self, game_state):
        num_moves = self._encoder.board_width * self._encoder.board_height

        board_tensor = self._encoder.encode(game_state)
        X = np.array([board_tensor])
        x = np.array([board_tensor])

        if np.random.random() < self._temperature:
            # Explore random moves.
            move_probs = np.ones(num_moves) / num_moves
        else:
            # Follow our current policy.
            move_probs = self._model.predict(X)[0]
            move_probs = self._model.predict(x)[0]

        # Prevent move probs from getting stuck at 0 or 1.
        eps = 1e-5
@@ -90,24 +95,29 @@ def serialize(self, h5file):
        h5file.create_group('model')
        kerasutil.save_model_to_hdf5_group(self._model, h5file['model'])

    def train(self, experience, lr, clipnorm, batch_size):
        self._model.compile(
            loss='categorical_crossentropy',
            optimizer=SGD(lr=lr, clipnorm=clipnorm))
    def train(self, experience, lr=0.0000001, clipnorm=1.0, batch_size=512):
        opt = SGD(lr=lr, clipnorm=clipnorm)
        self._model.compile(loss='categorical_crossentropy', optimizer=opt)

        target_vectors = prepare_experience_data(
            experience,
            self._encoder.board_width,
            self._encoder.board_height)
        n = experience.states.shape[0]
        # Translate the actions/rewards.
        num_moves = self._encoder.board_width * self._encoder.board_height
        y = np.zeros((n, num_moves))
        for i in range(n):
            action = experience.actions[i]
            reward = experience.rewards[i]
            y[i][action] = reward

        self._model.fit(
            experience.states, target_vectors,
            experience.states, y,
            batch_size=batch_size,
            epochs=1)
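The inlined target construction above does the same job as prepare_experience_data: for every recorded position, the training target is a zero vector with the game's reward written into the slot of the action that was actually played. A toy illustration (hypothetical, not part of this commit) for a 3x3 board where the agent played point index 4 and went on to win:

import numpy as np

num_moves = 9                # 3x3 board, flattened
y_row = np.zeros(num_moves)
y_row[4] = 1                 # reward: +1 for a win, -1 for a loss
# y_row is now [0. 0. 0. 0. 1. 0. 0. 0. 0.]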


def load_policy_agent(h5file):
    model = kerasutil.load_model_from_hdf5_group(h5file['model'])
    model = kerasutil.load_model_from_hdf5_group(
        h5file['model'],
        custom_objects={'policy_gradient_loss': policy_gradient_loss})
    encoder_name = h5file['encoder'].attrs['name']
    if not isinstance(encoder_name, str):
        encoder_name = encoder_name.decode('ascii')