Python源码示例:tensorflow.python.ops.state.assign_sub()
示例1
def _apply_dense(self, grad, var):
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
# the following equations given in [1]
# m_t = beta1 * m + (1 - beta1) * g_t
m = self.get_slot(var, "m")
m_t = state_ops.assign(m, beta1_t * m + (1. - beta1_t) * grad, use_locking=self._use_locking)
# v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
v = self.get_slot(var, "v")
v_t = state_ops.assign(v, beta2_t * v + (1. - beta2_t) * tf.square(grad), use_locking=self._use_locking)
v_prime = self.get_slot(var, "v_prime")
v_t_prime = state_ops.assign(v_prime, tf.maximum(v_prime, v_t))
var_update = state_ops.assign_sub(var,
lr_t * m_t / (tf.sqrt(v_t_prime) + epsilon_t),
use_locking=self._use_locking)
return control_flow_ops.group(*[var_update, m_t, v_t, v_t_prime])
# keras Nadam update rule
示例2
def _apply_dense(self, grad, var):
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
if var.dtype.base_dtype == tf.float16:
eps = 1e-7 # Can't use 1e-8 due to underflow -- not sure if it makes a big difference.
else:
eps = 1e-8
v = self.get_slot(var, "v")
v_t = v.assign(beta2_t * v + (1. - beta2_t) * tf.square(grad))
m = self.get_slot(var, "m")
m_t = m.assign( beta1_t * m + (1. - beta1_t) * grad )
v_t_hat = tf.div(v_t, 1. - beta2_t)
m_t_hat = tf.div(m_t, 1. - beta1_t)
g_t = tf.div( m_t, tf.sqrt(v_t)+eps )
g_t_1 = self.get_slot(var, "g")
g_t = g_t_1.assign( g_t )
var_update = state_ops.assign_sub(var, 2. * lr_t * g_t - lr_t * g_t_1) #Adam would be lr_t * g_t
return control_flow_ops.group(*[var_update, m_t, v_t, g_t])
示例3
def _apply_dense(self, grad, var):
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
# the following equations given in [1]
# m_t = beta1 * m + (1 - beta1) * g_t
m = self.get_slot(var, "m")
m_t = state_ops.assign(m, beta1_t * m + (1. - beta1_t) * grad, use_locking=self._use_locking)
# v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
v = self.get_slot(var, "v")
v_t = state_ops.assign(v, beta2_t * v + (1. - beta2_t) * tf.square(grad), use_locking=self._use_locking)
v_prime = self.get_slot(var, "v_prime")
v_t_prime = state_ops.assign(v_prime, tf.maximum(v_prime, v_t))
var_update = state_ops.assign_sub(var,
lr_t * m_t / (tf.sqrt(v_t_prime) + epsilon_t),
use_locking=self._use_locking)
return control_flow_ops.group(*[var_update, m_t, v_t, v_t_prime])
# keras Nadam update rule
示例4
def _apply_dense(self, grad, var):
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
if var.dtype.base_dtype == tf.float16:
eps = 1e-7 # Can't use 1e-8 due to underflow -- not sure if it makes a big difference.
else:
eps = 1e-8
v = self.get_slot(var, "v")
v_t = v.assign(beta1_t * v + (1. - beta1_t) * grad)
m = self.get_slot(var, "m")
m_t = m.assign(tf.maximum(beta2_t * m + eps, tf.abs(grad)))
g_t = v_t / m_t
var_update = state_ops.assign_sub(var, lr_t * g_t)
return control_flow_ops.group(*[var_update, m_t, v_t])
示例5
def _apply_dense(self, grad, var):
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
if var.dtype.base_dtype == tf.float16:
eps = 1e-7 # Can't use 1e-8 due to underflow -- not sure if it makes a big difference.
else:
eps = 1e-8
v = self.get_slot(var, "v")
v_t = v.assign(beta1_t * v + (1. - beta1_t) * grad)
m = self.get_slot(var, "m")
m_t = m.assign(tf.maximum(beta2_t * m + eps, tf.abs(grad)))
g_t = v_t / m_t
var_update = state_ops.assign_sub(var, lr_t * g_t)
return control_flow_ops.group(*[var_update, m_t, v_t])
示例6
def _apply_dense(self, grad, var):
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
if var.dtype.base_dtype == tf.float16:
eps = 1e-7
# Can't use 1e-8 due to underflow -- not sure if it makes a big difference.
else:
eps = 1e-8
v = self.get_slot(var, "v")
v_t = v.assign(beta1_t * v + (1. - beta1_t) * grad)
m = self.get_slot(var, "m")
m_t = m.assign(tf.maximum(beta2_t * m + eps, tf.abs(grad)))
g_t = v_t / m_t
var_update = state_ops.assign_sub(var, lr_t * g_t)
return control_flow_ops.group(*[var_update, m_t, v_t])
示例7
def _apply_dense(self, grad, var):
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
if var.dtype.base_dtype == tf.float16:
eps = 1e-7
# Can't use 1e-8 due to underflow -- not sure if it makes a big difference.
else:
eps = 1e-8
v = self.get_slot(var, "v")
v_t = v.assign(beta1_t * v + (1. - beta1_t) * grad)
m = self.get_slot(var, "m")
m_t = m.assign(tf.maximum(beta2_t * m + eps, tf.abs(grad)))
g_t = v_t / m_t
var_update = state_ops.assign_sub(var, lr_t * g_t)
return control_flow_ops.group(*[var_update, m_t, v_t])
示例8
def _apply_dense(self, grad, var):
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
# the following equations given in [1]
# m_t = beta1 * m + (1 - beta1) * g_t
m = self.get_slot(var, "m")
m_t = state_ops.assign(m, beta1_t * m + (1. - beta1_t) * grad, use_locking=self._use_locking)
# v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
v = self.get_slot(var, "v")
v_t = state_ops.assign(v, beta2_t * v + (1. - beta2_t) * tf.square(grad), use_locking=self._use_locking)
v_prime = self.get_slot(var, "v_prime")
v_t_prime = state_ops.assign(v_prime, tf.maximum(v_prime, v_t))
var_update = state_ops.assign_sub(var,
lr_t * m_t / (tf.sqrt(v_t_prime) + epsilon_t),
use_locking=self._use_locking)
return control_flow_ops.group(*[var_update, m_t, v_t, v_t_prime])
# keras Nadam update rule
示例9
def _apply_dense(self, grad, var):
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
if var.dtype.base_dtype == tf.float16:
eps = 1e-7 # Can't use 1e-8 due to underflow -- not sure if it makes a big difference.
else:
eps = 1e-8
v = self.get_slot(var, "v")
v_t = v.assign(beta1_t * v + (1. - beta1_t) * grad)
m = self.get_slot(var, "m")
m_t = m.assign(tf.maximum(beta2_t * m + eps, tf.abs(grad)))
g_t = v_t / m_t
var_update = state_ops.assign_sub(var, lr_t * g_t)
return control_flow_ops.group(*[var_update, m_t, v_t])
示例10
def _apply_dense(self, grad, var):
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype)
beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype)
eps = 1e-7 # cap for moving average
m = self.get_slot(var, "m")
m_t = m.assign(tf.maximum(beta_t * m + eps, tf.abs(grad)))
var_update = state_ops.assign_sub(var, lr_t * grad * tf.exp(
tf.log(alpha_t) * tf.sign(grad) * tf.sign(m_t))) # Update 'ref' by subtracting 'value
# Create an op that groups multiple operations.
# When this op finishes, all ops in input have finished
return control_flow_ops.group(*[var_update, m_t])
示例11
def _apply_dense(self, grad, var):
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype)
alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype)
eps = 1e-7 # cap for moving average
m = self.get_slot(var, "m")
m_t = m.assign(tf.maximum(beta_t * m + eps, tf.abs(grad)))
var_update = state_ops.assign_sub(var, lr_t * grad * (1.0 + alpha_t * tf.sign(grad) * tf.sign(m_t)))
# Create an op that groups multiple operations
# When this op finishes, all ops in input have finished
return control_flow_ops.group(*[var_update, m_t])
示例12
def _apply_dense(self, grad, var):
beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
# m_t = beta1 * m + (1 - beta1) * g_t
m = self.get_slot(var, "m")
m_scaled_g_values = grad * (1 - beta1_t)
m_t = state_ops.assign(m, beta1_t * m + m_scaled_g_values,
use_locking=self._use_locking)
# v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
v = self.get_slot(var, "v")
v_scaled_g_values = (grad * grad) * (1 - beta2_t)
v_t = state_ops.assign(v, beta2_t * v + v_scaled_g_values,
use_locking=self._use_locking)
# amsgrad
vhat = self.get_slot(var, "vhat")
vhat_t = state_ops.assign(vhat, math_ops.maximum(v_t, vhat))
v_sqrt = math_ops.sqrt(vhat_t)
var_update = state_ops.assign_sub(var, lr * m_t / (v_sqrt + epsilon_t),
use_locking=self._use_locking)
return control_flow_ops.group(*[var_update, m_t, v_t, vhat_t])
示例13
def _resource_apply_dense(self, grad, var):
var = var.handle
beta1_power = math_ops.cast(self._beta1_power, grad.dtype.base_dtype)
beta2_power = math_ops.cast(self._beta2_power, grad.dtype.base_dtype)
lr_t = math_ops.cast(self._lr_t, grad.dtype.base_dtype)
beta1_t = math_ops.cast(self._beta1_t, grad.dtype.base_dtype)
beta2_t = math_ops.cast(self._beta2_t, grad.dtype.base_dtype)
epsilon_t = math_ops.cast(self._epsilon_t, grad.dtype.base_dtype)
lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
# m_t = beta1 * m + (1 - beta1) * g_t
m = self.get_slot(var, "m").handle
m_scaled_g_values = grad * (1 - beta1_t)
m_t = state_ops.assign(m, beta1_t * m + m_scaled_g_values,
use_locking=self._use_locking)
# v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
v = self.get_slot(var, "v").handle
v_scaled_g_values = (grad * grad) * (1 - beta2_t)
v_t = state_ops.assign(v, beta2_t * v + v_scaled_g_values,
use_locking=self._use_locking)
# amsgrad
vhat = self.get_slot(var, "vhat").handle
vhat_t = state_ops.assign(vhat, math_ops.maximum(v_t, vhat))
v_sqrt = math_ops.sqrt(vhat_t)
var_update = state_ops.assign_sub(var, lr * m_t / (v_sqrt + epsilon_t),
use_locking=self._use_locking)
return control_flow_ops.group(*[var_update, m_t, v_t, vhat_t])
示例14
def _apply_sparse_shared(self, grad, var, indices, scatter_add):
beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
# m_t = beta1 * m + (1 - beta1) * g_t
m = self.get_slot(var, "m")
m_scaled_g_values = grad * (1 - beta1_t)
m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
with ops.control_dependencies([m_t]):
m_t = scatter_add(m, indices, m_scaled_g_values)
# v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
v = self.get_slot(var, "v")
v_scaled_g_values = (grad * grad) * (1 - beta2_t)
v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
with ops.control_dependencies([v_t]):
v_t = scatter_add(v, indices, v_scaled_g_values)
# amsgrad
vhat = self.get_slot(var, "vhat")
vhat_t = state_ops.assign(vhat, math_ops.maximum(v_t, vhat))
v_sqrt = math_ops.sqrt(vhat_t)
var_update = state_ops.assign_sub(var, lr * m_t / (v_sqrt + epsilon_t),
use_locking=self._use_locking)
return control_flow_ops.group(*[var_update, m_t, v_t, vhat_t])
示例15
def _apply_dense(self, grad, var):
beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
# m_t = beta1 * m + (1 - beta1) * g_t
m = self.get_slot(var, "m")
m_scaled_g_values = grad * (1 - beta1_t)
m_t = state_ops.assign(m, beta1_t * m + m_scaled_g_values, use_locking=self._use_locking)
# v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
v = self.get_slot(var, "v")
v_scaled_g_values = (grad * grad) * (1 - beta2_t)
v_t = state_ops.assign(v, beta2_t * v + v_scaled_g_values, use_locking=self._use_locking)
# amsgrad
vhat = self.get_slot(var, "vhat")
vhat_t = state_ops.assign(vhat, math_ops.maximum(v_t, vhat))
v_sqrt = math_ops.sqrt(vhat_t)
var_update = state_ops.assign_sub(var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
return control_flow_ops.group(*[var_update, m_t, v_t, vhat_t])
示例16
def _resource_apply_dense(self, grad, var):
var = var.handle
beta1_power = math_ops.cast(self._beta1_power, grad.dtype.base_dtype)
beta2_power = math_ops.cast(self._beta2_power, grad.dtype.base_dtype)
lr_t = math_ops.cast(self._lr_t, grad.dtype.base_dtype)
beta1_t = math_ops.cast(self._beta1_t, grad.dtype.base_dtype)
beta2_t = math_ops.cast(self._beta2_t, grad.dtype.base_dtype)
epsilon_t = math_ops.cast(self._epsilon_t, grad.dtype.base_dtype)
lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
# m_t = beta1 * m + (1 - beta1) * g_t
m = self.get_slot(var, "m").handle
m_scaled_g_values = grad * (1 - beta1_t)
m_t = state_ops.assign(m, beta1_t * m + m_scaled_g_values, use_locking=self._use_locking)
# v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
v = self.get_slot(var, "v").handle
v_scaled_g_values = (grad * grad) * (1 - beta2_t)
v_t = state_ops.assign(v, beta2_t * v + v_scaled_g_values, use_locking=self._use_locking)
# amsgrad
vhat = self.get_slot(var, "vhat").handle
vhat_t = state_ops.assign(vhat, math_ops.maximum(v_t, vhat))
v_sqrt = math_ops.sqrt(vhat_t)
var_update = state_ops.assign_sub(var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
return control_flow_ops.group(*[var_update, m_t, v_t, vhat_t])
示例17
def _apply_sparse_shared(self, grad, var, indices, scatter_add):
beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
# m_t = beta1 * m + (1 - beta1) * g_t
m = self.get_slot(var, "m")
m_scaled_g_values = grad * (1 - beta1_t)
m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
with ops.control_dependencies([m_t]):
m_t = scatter_add(m, indices, m_scaled_g_values)
# v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
v = self.get_slot(var, "v")
v_scaled_g_values = (grad * grad) * (1 - beta2_t)
v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
with ops.control_dependencies([v_t]):
v_t = scatter_add(v, indices, v_scaled_g_values)
# amsgrad
vhat = self.get_slot(var, "vhat")
vhat_t = state_ops.assign(vhat, math_ops.maximum(v_t, vhat))
v_sqrt = math_ops.sqrt(vhat_t)
var_update = state_ops.assign_sub(var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
return control_flow_ops.group(*[var_update, m_t, v_t, vhat_t])
示例18
def _apply_dense(self, grad, var):
# Calculates the preconditioner statistics for each tensor.
partitioned_grads = TensorPartitioner.partition_tensor(
grad, self._partition_info)
shape = var.get_shape()
fallback_to_diagonal = self._fallback_to_diagonal_for_shape(shape)
precond_statistics_update = []
if not fallback_to_diagonal:
precond_statistics_update = self._updated_statistics(
var, partitioned_grads)
accumulator = self.get_slot(var, "accumulator")
accumulator_updated = state_ops.assign_add(accumulator, grad * grad)
accumulator_inv_sqrt = math_ops.rsqrt(accumulator_updated + 1e-30)
if self._momentum > 0.0:
scaled_g = (1.0 - self._momentum_tensor) * (grad * accumulator_inv_sqrt)
gbar = self.get_slot(var, "momentum")
gbar_updated = state_ops.assign_add(
gbar,
gbar * (self._momentum_tensor - 1.0) + scaled_g)
else:
gbar_updated = (grad * accumulator_inv_sqrt)
if not fallback_to_diagonal:
# Update the preconditioner statistics followed by computing the
# preconditioned gradient.
with ops.control_dependencies(precond_statistics_update):
s = tf.cast(self._run_nondiagonal_update, tf.float32)
preconditioned_grad = self._preconditioned_update(
var, partitioned_grads, gbar_updated)
# slowly adapt from diagonal to preconditioned gradient.
w = self._run_nondiagonal_update_warmup
warmup_update = s * self._learning_rate_tensor * (
w * preconditioned_grad + (1.0 - w) * gbar_updated)
fallback_update = (1 - s) * (self._learning_rate_tensor * gbar_updated)
return state_ops.assign_sub(var, warmup_update + fallback_update)
else:
return state_ops.assign_sub(var,
self._learning_rate_tensor * gbar_updated)
示例19
def assign_sub(self, delta, use_locking=False):
"""Subtracts a value from this variable.
This is essentially a shortcut for `assign_sub(self, delta)`.
Args:
delta: A `Tensor`. The value to subtract from this variable.
use_locking: If `True`, use locking during the operation.
Returns:
A `Tensor` that will hold the new value of this variable after
the subtraction has completed.
"""
return state_ops.assign_sub(self._variable, delta, use_locking=use_locking)
示例20
def _apply_sparse_shared(self, grad, var, indices, scatter_add):
beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
# m_t = beta1 * m + (1 - beta1) * g_t
m = self.get_slot(var, "m")
m_scaled_g_values = grad * (1 - beta1_t)
m_t = state_ops.assign(m, m * beta1_t,
use_locking=self._use_locking)
with ops.control_dependencies([m_t]):
m_t = scatter_add(m, indices, m_scaled_g_values)
# v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
v = self.get_slot(var, "v")
v_scaled_g_values = (grad * grad) * (1 - beta2_t)
v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
with ops.control_dependencies([v_t]):
v_t = scatter_add(v, indices, v_scaled_g_values)
v_sqrt = math_ops.sqrt(v_t)
var_update = state_ops.assign_sub(var,
lr * m_t / (v_sqrt + epsilon_t),
use_locking=self._use_locking)
return control_flow_ops.group(*[var_update, m_t, v_t])
示例21
def _mini_batch_sync_updates_op(self, update_in_steps,
cluster_centers_var, cluster_centers_updated,
total_counts):
if self._use_mini_batch and self._mini_batch_steps_per_iteration > 1:
assert update_in_steps is not None
with ops.colocate_with(update_in_steps):
def _f():
# Note that there is a race condition here, so we do a best effort
# updates here. We reset update_in_steps first so that other workers
# don't duplicate the updates. Also we update cluster_center_vars
# before resetting total_counts to avoid large updates to
# cluster_centers_updated based on partially updated
# cluster_center_vars.
with ops.control_dependencies([state_ops.assign(
update_in_steps,
self._mini_batch_steps_per_iteration - 1)]):
with ops.colocate_with(cluster_centers_updated):
if self._distance_metric == COSINE_DISTANCE:
cluster_centers = nn_impl.l2_normalize(cluster_centers_updated,
dim=1)
else:
cluster_centers = cluster_centers_updated
with ops.colocate_with(cluster_centers_var):
with ops.control_dependencies([state_ops.assign(
cluster_centers_var,
cluster_centers)]):
with ops.colocate_with(cluster_centers_var):
with ops.control_dependencies([
state_ops.assign(total_counts,
array_ops.zeros_like(total_counts))]):
return array_ops.identity(update_in_steps)
return control_flow_ops.cond(
update_in_steps <= 0,
_f,
lambda: state_ops.assign_sub(update_in_steps, 1))
else:
return control_flow_ops.no_op()
示例22
def update_sub(x, decrement):
return state_ops.assign_sub(x, decrement)
示例23
def assign_sub(self, delta, use_locking=False):
"""Subtracts a value from this variable.
This is essentially a shortcut for `assign_sub(self, delta)`.
Args:
delta: A `Tensor`. The value to subtract from this variable.
use_locking: If `True`, use locking during the operation.
Returns:
A `Tensor` that will hold the new value of this variable after
the subtraction has completed.
"""
return state_ops.assign_sub(self._variable, delta, use_locking=use_locking)
示例24
def _apply_sparse(self, grad, var):
beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
# m_t = beta1 * m + (1 - beta1) * g_t
m = self.get_slot(var, "m")
m_scaled_g_values = grad.values * (1 - beta1_t)
m_t = state_ops.assign(m, m * beta1_t,
use_locking=self._use_locking)
m_t = state_ops.scatter_add(m_t, grad.indices, m_scaled_g_values,
use_locking=self._use_locking)
# v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
v = self.get_slot(var, "v")
v_scaled_g_values = (grad.values * grad.values) * (1 - beta2_t)
v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
v_t = state_ops.scatter_add(v_t, grad.indices, v_scaled_g_values,
use_locking=self._use_locking)
v_sqrt = math_ops.sqrt(v_t)
var_update = state_ops.assign_sub(var,
lr * m_t / (v_sqrt + epsilon_t),
use_locking=self._use_locking)
return control_flow_ops.group(*[var_update, m_t, v_t])
示例25
def _apply_dense(self, grad, var):
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype)
beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype)
eps = 1e-7 # cap for moving average
m = self.get_slot(var, "m")
m_t = m.assign(tf.maximum(beta_t * m + eps, tf.abs(grad)))
var_update = state_ops.assign_sub(var, lr_t * grad * tf.exp(
tf.log(alpha_t) * tf.sign(grad) * tf.sign(m_t))) # Update 'ref' by subtracting 'value
# Create an op that groups multiple operations.
# When this op finishes, all ops in input have finished
return control_flow_ops.group(*[var_update, m_t])
示例26
def _apply_dense(self, grad, var):
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype)
alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype)
eps = 1e-7 # cap for moving average
m = self.get_slot(var, "m")
m_t = m.assign(tf.maximum(beta_t * m + eps, tf.abs(grad)))
var_update = state_ops.assign_sub(var, lr_t * grad * (1.0 + alpha_t * tf.sign(grad) * tf.sign(m_t)))
# Create an op that groups multiple operations
# When this op finishes, all ops in input have finished
return control_flow_ops.group(*[var_update, m_t])
示例27
def _resource_apply_dense(self, grad, var):
step, beta1_power, beta2_power = self._get_beta_accumulators()
beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
sma_inf = 2.0 / (1.0 - beta2_t) - 1.0
sma_t = sma_inf - 2.0 * step * beta2_power / (1.0 - beta2_power)
m = self.get_slot(var, "m")
m_t = state_ops.assign(m, beta1_t * m + (1.0 - beta1_t) * grad, use_locking=self._use_locking)
mhat_t = m_t / (1.0 - beta1_power)
v = self.get_slot(var, "v")
v_t = state_ops.assign(v, beta2_t * v + (1.0 - beta2_t) * math_ops.square(grad), use_locking=self._use_locking)
vhat_t = math_ops.sqrt(v_t / ((1.0 - beta2_power) + epsilon_t))
r_t = math_ops.sqrt( ((sma_t - 4.0) * (sma_t - 2.0) * sma_inf) / ((sma_inf - 4.0) * (sma_inf - 2.0) * sma_t) )
var_t = tf.cond(sma_t >= 5.0, lambda : r_t * mhat_t / vhat_t, lambda : mhat_t)
if self._weight_decay > 0.0:
var_t += math_ops.cast(self._weight_decay_t, var.dtype.base_dtype) * var
var_update = state_ops.assign_sub(var, lr_t * var_t, use_locking=self._use_locking)
updates = [var_update, m_t, v_t]
return control_flow_ops.group(*updates)
示例28
def _decay_weights_op(self, var):
def apply_decay_fn(v, decay):
return state_ops.assign_sub(v, decay * v, self._use_locking)
def merge_fn(strategy, v, value):
value = strategy.reduce(
variable_scope.VariableAggregation.MEAN, value, v)
return strategy.update(v, apply_decay_fn, value)
if not self._decay_var_list or var in self._decay_var_list:
replica_context = distribute_lib.get_tower_context()
return replica_context.merge_call(merge_fn, var, self._weight_decay)
return control_flow_ops.no_op()
示例29
def _apply_grad_descent(self, grad, var):
lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
var_update = state_ops.assign_sub(var, grad * lr_t)
return control_flow_ops.group(*[var_update])
# Thanks to https://github.com/google/asymproj_edge_dnn/blob/master/edge_nn.py
示例30
def _apply_pd(self, grad, var):
def PlusEpsilon(x, eps=1e-6):
"""Element-wise add `eps` to `x` without changing sign of `x`."""
return x + (tf.sign(x) * eps)
target_t = math_ops.cast(self._target_t, var.dtype.base_dtype)
mean_percent_grad = tf.reduce_mean(tf.abs(tf.div(grad, PlusEpsilon(var))))
lr_t = tf.div(target_t, (mean_percent_grad + 1e-5))
var_update = state_ops.assign_sub(var, grad * lr_t)
return var_update