Python源码示例:baselines.a2c.utils.fc()
示例1
def mlp(num_layers=2, num_hidden=64, activation=tf.tanh):
"""
Stack of fully-connected layers to be used in a policy / q-function approximator
Parameters:
----------
num_layers: int number of fully-connected layers (default: 2)
num_hidden: int size of fully-connected layers (default: 64)
activation: activation function (default: tf.tanh)
Returns:
-------
function that builds fully connected network with a given input tensor / placeholder
"""
def network_fn(X):
h = tf.layers.flatten(X)
for i in range(num_layers):
h = activation(fc(h, 'mlp_fc{}'.format(i), nh=num_hidden, init_scale=np.sqrt(2)))
return h, None
return network_fn
示例2
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613
nh, nw, nc = ob_space.shape
ob_shape = (nbatch, nh, nw, nc)
self.pdtype = make_pdtype(ac_space)
X = tf.placeholder(tf.uint8, ob_shape) #obs
with tf.variable_scope("model", reuse=reuse):
h = nature_cnn(X, **conv_kwargs)
vf = fc(h, 'v', 1)[:,0]
self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)
a0 = self.pd.sample()
neglogp0 = self.pd.neglogp(a0)
self.initial_state = None
def step(ob, *_args, **_kwargs):
a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
return a, v, self.initial_state, neglogp
def value(ob, *_args, **_kwargs):
return sess.run(vf, {X:ob})
self.X = X
self.vf = vf
self.step = step
self.value = value
示例3
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False, **conv_kwargs): #pylint: disable=W0613
nh, nw, nc = ob_space.shape
ob_shape = (nbatch, nh, nw, nc)
self.pdtype = make_pdtype(ac_space)
X = tf.placeholder(tf.uint8, ob_shape) #obs
with tf.variable_scope("model", reuse=reuse):
h = nature_cnn(X, **conv_kwargs)
vf = fc(h, 'v', 1)[:,0]
self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)
a0 = self.pd.sample()
neglogp0 = self.pd.neglogp(a0)
self.initial_state = None
def step(ob, *_args, **_kwargs):
a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
return a, v, self.initial_state, neglogp
def value(ob, *_args, **_kwargs):
return sess.run(vf, {X:ob})
self.X = X
self.vf = vf
self.step = step
self.value = value
示例4
def nature_cnn(unscaled_images):
"""
CNN from Nature paper.
"""
scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
activ = tf.nn.relu
h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2)))
h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2)))
h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2)))
h3 = conv_to_fc(h3)
return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
示例5
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
nenv = nbatch // nsteps
nh, nw, nc = ob_space.shape
ob_shape = (nbatch, nh, nw, nc)
nact = ac_space.n
X = tf.placeholder(tf.uint8, ob_shape) #obs
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
with tf.variable_scope("model", reuse=reuse):
h = nature_cnn(X)
xs = batch_to_seq(h, nenv, nsteps)
ms = batch_to_seq(M, nenv, nsteps)
h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
h5 = seq_to_batch(h5)
pi = fc(h5, 'pi', nact)
vf = fc(h5, 'v', 1)
self.pdtype = make_pdtype(ac_space)
self.pd = self.pdtype.pdfromflat(pi)
v0 = vf[:, 0]
a0 = self.pd.sample()
neglogp0 = self.pd.neglogp(a0)
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
def step(ob, state, mask):
return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
def value(ob, state, mask):
return sess.run(v0, {X:ob, S:state, M:mask})
self.X = X
self.M = M
self.S = S
self.pi = pi
self.vf = vf
self.step = step
self.value = value
示例6
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
nenv = nbatch // nsteps
nh, nw, nc = ob_space.shape
ob_shape = (nbatch, nh, nw, nc)
nact = ac_space.n
X = tf.placeholder(tf.uint8, ob_shape) #obs
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
with tf.variable_scope("model", reuse=reuse):
h = nature_cnn(X)
xs = batch_to_seq(h, nenv, nsteps)
ms = batch_to_seq(M, nenv, nsteps)
h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
h5 = seq_to_batch(h5)
pi = fc(h5, 'pi', nact)
vf = fc(h5, 'v', 1)
self.pdtype = make_pdtype(ac_space)
self.pd = self.pdtype.pdfromflat(pi)
v0 = vf[:, 0]
a0 = self.pd.sample()
neglogp0 = self.pd.neglogp(a0)
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
def step(ob, state, mask):
return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
def value(ob, state, mask):
return sess.run(v0, {X:ob, S:state, M:mask})
self.X = X
self.M = M
self.S = S
self.pi = pi
self.vf = vf
self.step = step
self.value = value
示例7
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
nh, nw, nc = ob_space.shape
ob_shape = (nbatch, nh, nw, nc)
nact = ac_space.n
X = tf.placeholder(tf.uint8, ob_shape) #obs
with tf.variable_scope("model", reuse=reuse):
h = nature_cnn(X)
pi = fc(h, 'pi', nact, init_scale=0.01)
vf = fc(h, 'v', 1)[:,0]
self.pdtype = make_pdtype(ac_space)
self.pd = self.pdtype.pdfromflat(pi)
a0 = self.pd.sample()
neglogp0 = self.pd.neglogp(a0)
self.initial_state = None
def step(ob, *_args, **_kwargs):
a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
return a, v, self.initial_state, neglogp
def value(ob, *_args, **_kwargs):
return sess.run(vf, {X:ob})
self.X = X
self.pi = pi
self.vf = vf
self.step = step
self.value = value
示例8
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
ob_shape = (nbatch,) + ob_space.shape
actdim = ac_space.shape[0]
X = tf.placeholder(tf.float32, ob_shape, name='Ob') #obs
with tf.variable_scope("model", reuse=reuse):
activ = tf.tanh
h1 = activ(fc(X, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
h2 = activ(fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
pi = fc(h2, 'pi', actdim, init_scale=0.01)
h1 = activ(fc(X, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
h2 = activ(fc(h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
vf = fc(h2, 'vf', 1)[:,0]
logstd = tf.get_variable(name="logstd", shape=[1, actdim],
initializer=tf.zeros_initializer())
pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
self.pdtype = make_pdtype(ac_space)
self.pd = self.pdtype.pdfromflat(pdparam)
a0 = self.pd.sample()
neglogp0 = self.pd.neglogp(a0)
self.initial_state = None
def step(ob, *_args, **_kwargs):
a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
return a, v, self.initial_state, neglogp
def value(ob, *_args, **_kwargs):
return sess.run(vf, {X:ob})
self.X = X
self.pi = pi
self.vf = vf
self.step = step
self.value = value
########################################################################################################################
# Intrinsic Reward Augmented Policies
########################################################################################################################
示例9
def nature_cnn(unscaled_images):
"""
CNN from Nature paper.
"""
scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
activ = tf.nn.relu
h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2)))
h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2)))
h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2)))
h3 = conv_to_fc(h3)
return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
示例10
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
nenv = nbatch // nsteps
nh, nw, nc = ob_space.shape
ob_shape = (nbatch, nh, nw, nc)
nact = ac_space.n
X = tf.placeholder(tf.uint8, ob_shape) #obs
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
with tf.variable_scope("model", reuse=reuse):
h = nature_cnn(X)
xs = batch_to_seq(h, nenv, nsteps)
ms = batch_to_seq(M, nenv, nsteps)
h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
h5 = seq_to_batch(h5)
pi = fc(h5, 'pi', nact)
vf = fc(h5, 'v', 1)
self.pdtype = make_pdtype(ac_space)
self.pd = self.pdtype.pdfromflat(pi)
v0 = vf[:, 0]
a0 = self.pd.sample()
neglogp0 = self.pd.neglogp(a0)
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
def step(ob, state, mask):
return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
def value(ob, state, mask):
return sess.run(v0, {X:ob, S:state, M:mask})
self.X = X
self.M = M
self.S = S
self.pi = pi
self.vf = vf
self.step = step
self.value = value
示例11
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
nenv = nbatch // nsteps
nh, nw, nc = ob_space.shape
ob_shape = (nbatch, nh, nw, nc)
nact = ac_space.n
X = tf.placeholder(tf.uint8, ob_shape) #obs
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
with tf.variable_scope("model", reuse=reuse):
h = nature_cnn(X)
xs = batch_to_seq(h, nenv, nsteps)
ms = batch_to_seq(M, nenv, nsteps)
h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
h5 = seq_to_batch(h5)
pi = fc(h5, 'pi', nact)
vf = fc(h5, 'v', 1)
self.pdtype = make_pdtype(ac_space)
self.pd = self.pdtype.pdfromflat(pi)
v0 = vf[:, 0]
a0 = self.pd.sample()
neglogp0 = self.pd.neglogp(a0)
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
def step(ob, state, mask):
return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
def value(ob, state, mask):
return sess.run(v0, {X:ob, S:state, M:mask})
self.X = X
self.M = M
self.S = S
self.pi = pi
self.vf = vf
self.step = step
self.value = value
示例12
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
nh, nw, nc = ob_space.shape
ob_shape = (nbatch, nh, nw, nc)
nact = ac_space.n
X = tf.placeholder(tf.uint8, ob_shape) #obs
with tf.variable_scope("model", reuse=reuse):
h = nature_cnn(X)
pi = fc(h, 'pi', nact, init_scale=0.01)
vf = fc(h, 'v', 1)[:,0]
self.pdtype = make_pdtype(ac_space)
self.pd = self.pdtype.pdfromflat(pi)
a0 = self.pd.sample()
neglogp0 = self.pd.neglogp(a0)
self.initial_state = None
def step(ob, *_args, **_kwargs):
a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
return a, v, self.initial_state, neglogp
def value(ob, *_args, **_kwargs):
return sess.run(vf, {X:ob})
self.X = X
self.pi = pi
self.vf = vf
self.step = step
self.value = value
示例13
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False): #pylint: disable=W0613
ob_shape = (nbatch,) + ob_space.shape
actdim = ac_space.shape[0]
X = tf.placeholder(tf.float32, ob_shape, name='Ob') #obs
with tf.variable_scope("model", reuse=reuse):
activ = tf.tanh
h1 = activ(fc(X, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
h2 = activ(fc(h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
pi = fc(h2, 'pi', actdim, init_scale=0.01)
h1 = activ(fc(X, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
h2 = activ(fc(h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
vf = fc(h2, 'vf', 1)[:,0]
logstd = tf.get_variable(name="logstd", shape=[1, actdim],
initializer=tf.zeros_initializer())
pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)
self.pdtype = make_pdtype(ac_space)
self.pd = self.pdtype.pdfromflat(pdparam)
a0 = self.pd.sample()
neglogp0 = self.pd.neglogp(a0)
self.initial_state = None
def step(ob, *_args, **_kwargs):
a, v, neglogp = sess.run([a0, vf, neglogp0], {X:ob})
return a, v, self.initial_state, neglogp
def value(ob, *_args, **_kwargs):
return sess.run(vf, {X:ob})
self.X = X
self.pi = pi
self.vf = vf
self.step = step
self.value = value
########################################################################################################################
# Intrinsic Reward Augmented Policies
########################################################################################################################
示例14
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
nbatch = nenv * nsteps
nh, nw, nc = ob_space.shape
ob_shape = (nbatch, nh, nw, nc * nstack)
nact = ac_space.n
X = tf.placeholder(tf.uint8, ob_shape) # obs
with tf.variable_scope("model", reuse=reuse):
h = nature_cnn(X)
pi_logits = fc(h, 'pi', nact, init_scale=0.01)
pi = tf.nn.softmax(pi_logits)
q = fc(h, 'q', nact)
a = sample(pi_logits) # could change this to use self.pi instead
self.initial_state = [] # not stateful
self.X = X
self.pi = pi # actual policy params now
self.q = q
def step(ob, *args, **kwargs):
# returns actions, mus, states
a0, pi0 = sess.run([a, pi], {X: ob})
return a0, pi0, [] # dummy state
def out(ob, *args, **kwargs):
pi0, q0 = sess.run([pi, q], {X: ob})
return pi0, q0
def act(ob, *args, **kwargs):
return sess.run(a, {X: ob})
self.step = step
self.out = out
self.act = act
示例15
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
nbatch = nenv * nsteps
nh, nw, nc = ob_space.shape
ob_shape = (nbatch, nh, nw, nc * nstack)
nact = ac_space.n
X = tf.placeholder(tf.uint8, ob_shape) # obs
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
with tf.variable_scope("model", reuse=reuse):
h = nature_cnn(X)
# lstm
xs = batch_to_seq(h, nenv, nsteps)
ms = batch_to_seq(M, nenv, nsteps)
h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
h5 = seq_to_batch(h5)
pi_logits = fc(h5, 'pi', nact, init_scale=0.01)
pi = tf.nn.softmax(pi_logits)
q = fc(h5, 'q', nact)
a = sample(pi_logits) # could change this to use self.pi instead
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
self.X = X
self.M = M
self.S = S
self.pi = pi # actual policy params now
self.q = q
def step(ob, state, mask, *args, **kwargs):
# returns actions, mus, states
a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask})
return a0, pi0, s
self.step = step
示例16
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
nbatch = nenv * nsteps
nh, nw, nc = ob_space.shape
ob_shape = (nbatch, nh, nw, nc * nstack)
nact = ac_space.n
X = tf.placeholder(tf.uint8, ob_shape) # obs
with tf.variable_scope("model", reuse=reuse):
h = nature_cnn(X)
pi_logits = fc(h, 'pi', nact, init_scale=0.01)
pi = tf.nn.softmax(pi_logits)
q = fc(h, 'q', nact)
a = sample(tf.nn.softmax(pi_logits)) # could change this to use self.pi instead
self.initial_state = [] # not stateful
self.X = X
self.pi = pi # actual policy params now
self.pi_logits = pi_logits
self.q = q
self.vf = q
def step(ob, *args, **kwargs):
# returns actions, mus, states
a0, pi0 = sess.run([a, pi], {X: ob})
return a0, pi0, [] # dummy state
def out(ob, *args, **kwargs):
pi0, q0 = sess.run([pi, q], {X: ob})
return pi0, q0
def act(ob, *args, **kwargs):
return sess.run(a, {X: ob})
self.step = step
self.out = out
self.act = act
示例17
def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
nbatch = nenv * nsteps
nh, nw, nc = ob_space.shape
ob_shape = (nbatch, nh, nw, nc * nstack)
nact = ac_space.n
X = tf.placeholder(tf.uint8, ob_shape) # obs
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
with tf.variable_scope("model", reuse=reuse):
h = nature_cnn(X)
# lstm
xs = batch_to_seq(h, nenv, nsteps)
ms = batch_to_seq(M, nenv, nsteps)
h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
h5 = seq_to_batch(h5)
pi_logits = fc(h5, 'pi', nact, init_scale=0.01)
pi = tf.nn.softmax(pi_logits)
q = fc(h5, 'q', nact)
a = sample(pi_logits) # could change this to use self.pi instead
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
self.X = X
self.M = M
self.S = S
self.pi = pi # actual policy params now
self.q = q
def step(ob, state, mask, *args, **kwargs):
# returns actions, mus, states
a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask})
return a0, pi0, s
self.step = step
示例18
def nature_cnn(unscaled_images, **conv_kwargs):
"""
CNN from Nature paper.
"""
scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
activ = tf.nn.relu
h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2),
**conv_kwargs))
h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs))
h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), **conv_kwargs))
h3 = conv_to_fc(h3)
return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
示例19
def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
pdparam = fc(latent_vector, 'pi', self.ncat, init_scale=init_scale, init_bias=init_bias)
return self.pdfromflat(pdparam), pdparam
示例20
def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
mean = fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
logstd = tf.get_variable(name='pi/logstd', shape=[1, self.size], initializer=tf.zeros_initializer())
pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
return self.pdfromflat(pdparam), mean
示例21
def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
pdparam = fc(latent_vector, 'pi', self.ncat, init_scale=init_scale, init_bias=init_bias)
return self.pdfromflat(pdparam), pdparam
示例22
def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
mean = fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
logstd = tf.get_variable(name='pi/logstd', shape=[1, self.size], initializer=tf.zeros_initializer())
pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
return self.pdfromflat(pdparam), mean
示例23
def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
pdparam = fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
return self.pdfromflat(pdparam), pdparam
# WRONG SECOND DERIVATIVES
# class CategoricalPd(Pd):
# def __init__(self, logits):
# self.logits = logits
# self.ps = tf.nn.softmax(logits)
# @classmethod
# def fromflat(cls, flat):
# return cls(flat)
# def flatparam(self):
# return self.logits
# def mode(self):
# return U.argmax(self.logits, axis=-1)
# def logp(self, x):
# return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x)
# def kl(self, other):
# return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \
# - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps)
# def entropy(self):
# return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps)
# def sample(self):
# u = tf.random_uniform(tf.shape(self.logits))
# return U.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)
示例24
def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
mean = fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
logstd = tf.get_variable(name='pi/logstd', shape=[1, self.size], initializer=tf.zeros_initializer())
pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
return self.pdfromflat(pdparam), mean
示例25
def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
pdparam = fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
return self.pdfromflat(pdparam), pdparam
# WRONG SECOND DERIVATIVES
# class CategoricalPd(Pd):
# def __init__(self, logits):
# self.logits = logits
# self.ps = tf.nn.softmax(logits)
# @classmethod
# def fromflat(cls, flat):
# return cls(flat)
# def flatparam(self):
# return self.logits
# def mode(self):
# return U.argmax(self.logits, axis=-1)
# def logp(self, x):
# return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x)
# def kl(self, other):
# return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \
# - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps)
# def entropy(self):
# return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps)
# def sample(self):
# u = tf.random_uniform(tf.shape(self.logits))
# return U.argmax(self.logits - tf.log(-tf.log(u)), axis=-1)
示例26
def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
pdparam = fc(latent_vector, 'pi', self.ncat, init_scale=init_scale, init_bias=init_bias)
return self.pdfromflat(pdparam), pdparam
示例27
def pdfromlatent(self, latent_vector, init_scale=1.0, init_bias=0.0):
mean = fc(latent_vector, 'pi', self.size, init_scale=init_scale, init_bias=init_bias)
logstd = tf.get_variable(name='pi/logstd', shape=[1, self.size], initializer=tf.zeros_initializer())
pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
return self.pdfromflat(pdparam), mean
示例28
def nature_cnn(unscaled_images):
"""
CNN from Nature paper.
"""
scaled_images = tf.cast(unscaled_images, tf.float32) / 255.
activ = tf.nn.relu
h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2)))
h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2)))
h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2)))
h3 = conv_to_fc(h3)
return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2)))
示例29
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
nenv = nbatch // nsteps
nh, nw, nc = ob_space.shape
ob_shape = (nbatch, nh, nw, nc)
nact = ac_space.n
X = tf.placeholder(tf.uint8, ob_shape) #obs
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
with tf.variable_scope("model", reuse=reuse):
h = nature_cnn(X)
xs = batch_to_seq(h, nenv, nsteps)
ms = batch_to_seq(M, nenv, nsteps)
h5, snew = lnlstm(xs, ms, S, 'lstm1', nh=nlstm)
h5 = seq_to_batch(h5)
pi = fc(h5, 'pi', nact)
vf = fc(h5, 'v', 1)
self.pdtype = make_pdtype(ac_space)
self.pd = self.pdtype.pdfromflat(pi)
v0 = vf[:, 0]
a0 = self.pd.sample()
neglogp0 = self.pd.neglogp(a0)
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
def step(ob, state, mask):
return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
def value(ob, state, mask):
return sess.run(v0, {X:ob, S:state, M:mask})
self.X = X
self.M = M
self.S = S
self.pi = pi
self.vf = vf
self.step = step
self.value = value
示例30
def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, nlstm=256, reuse=False):
nenv = nbatch // nsteps
nh, nw, nc = ob_space.shape
ob_shape = (nbatch, nh, nw, nc)
nact = ac_space.n
X = tf.placeholder(tf.uint8, ob_shape) #obs
M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
with tf.variable_scope("model", reuse=reuse):
h = nature_cnn(X)
xs = batch_to_seq(h, nenv, nsteps)
ms = batch_to_seq(M, nenv, nsteps)
h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
h5 = seq_to_batch(h5)
pi = fc(h5, 'pi', nact)
vf = fc(h5, 'v', 1)
self.pdtype = make_pdtype(ac_space)
self.pd = self.pdtype.pdfromflat(pi)
v0 = vf[:, 0]
a0 = self.pd.sample()
neglogp0 = self.pd.neglogp(a0)
self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
def step(ob, state, mask):
return sess.run([a0, v0, snew, neglogp0], {X:ob, S:state, M:mask})
def value(ob, state, mask):
return sess.run(v0, {X:ob, S:state, M:mask})
self.X = X
self.M = M
self.S = S
self.pi = pi
self.vf = vf
self.step = step
self.value = value