"""Built-in loss functions. """ from __future__ import absolute_import from __future__ import division from __future__ import print_function import abc import six from . import backend as K from .utils import losses_utils from .utils.generic_utils import deserialize_keras_object from .utils.generic_utils import serialize_keras_object @six.add_metaclass(abc.ABCMeta) class Loss(object): """Loss base class. To be implemented by subclasses: * `call()`: Contains the logic for loss calculation using `y_true`, `y_pred`. Example subclass implementation: ```python class MeanSquaredError(Loss): def call(self, y_true, y_pred): y_pred = ops.convert_to_tensor(y_pred) y_true = math_ops.cast(y_true, y_pred.dtype) return K.mean(math_ops.square(y_pred - y_true), axis=-1) ``` # Arguments reduction: (Optional) Type of loss Reduction to apply to loss. Default value is `SUM_OVER_BATCH_SIZE`. name: Optional name for the object. """ def __init__(self, reduction=losses_utils.Reduction.SUM_OVER_BATCH_SIZE, name=None): self.reduction = reduction self.name = name def __call__(self, y_true, y_pred, sample_weight=None): """Invokes the `Loss` instance. # Arguments y_true: Ground truth values. y_pred: The predicted values. sample_weight: Optional `Tensor` whose rank is either 0, or the same rank as `y_true`, or is broadcastable to `y_true`. `sample_weight` acts as a coefficient for the loss. If a scalar is provided, then the loss is simply scaled by the given value. If `sample_weight` is a tensor of size `[batch_size]`, then the total loss for each sample of the batch is rescaled by the corresponding element in the `sample_weight` vector. If the shape of `sample_weight` matches the shape of `y_pred`, then the loss of each measurable element of `y_pred` is scaled by the corresponding value of `sample_weight`. # Returns Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same shape as `y_true`; otherwise, it is scalar. # Raises ValueError: If the shape of `sample_weight` is invalid. """ # If we are wrapping a lambda function strip '<>' from the name as it is not # accepted in scope name. scope_name = 'lambda' if self.name == '' else self.name with K.name_scope(scope_name): losses = self.call(y_true, y_pred) return losses_utils.compute_weighted_loss( losses, sample_weight, reduction=self.reduction) @classmethod def from_config(cls, config): """Instantiates a `Loss` from its config (output of `get_config()`). # Arguments config: Output of `get_config()`. # Returns A `Loss` instance. """ return cls(**config) def get_config(self): return {'reduction': self.reduction, 'name': self.name} @abc.abstractmethod def call(self, y_true, y_pred): """Invokes the `Loss` instance. # Arguments y_true: Ground truth values, with the same shape as 'y_pred'. y_pred: The predicted values. """ raise NotImplementedError('Must be implemented in subclasses.') class LossFunctionWrapper(Loss): """Wraps a loss function in the `Loss` class. # Arguments fn: The loss function to wrap, with signature `fn(y_true, y_pred, **kwargs)`. reduction: (Optional) Type of loss reduction to apply to loss. Default value is `SUM_OVER_BATCH_SIZE`. name: (Optional) name for the loss. **kwargs: The keyword arguments that are passed on to `fn`. """ def __init__(self, fn, reduction=losses_utils.Reduction.SUM_OVER_BATCH_SIZE, name=None, **kwargs): super(LossFunctionWrapper, self).__init__(reduction=reduction, name=name) self.fn = fn self._fn_kwargs = kwargs def call(self, y_true, y_pred): """Invokes the `LossFunctionWrapper` instance. # Arguments y_true: Ground truth values. y_pred: The predicted values. # Returns Loss values per sample. """ return self.fn(y_true, y_pred, **self._fn_kwargs) def get_config(self): config = {} for k, v in six.iteritems(self._fn_kwargs): config[k] = K.eval(v) if K.is_tensor(v) or K.is_variable(v) else v base_config = super(LossFunctionWrapper, self).get_config() return dict(list(base_config.items()) + list(config.items())) class MeanSquaredError(LossFunctionWrapper): """Computes the mean of squares of errors between labels and predictions. Standalone usage: ```python mse = keras.losses.MeanSquaredError() loss = mse([0., 0., 1., 1.], [1., 1., 1., 0.]) ``` Usage with the `compile` API: ```python model = keras.Model(inputs, outputs) model.compile('sgd', loss=keras.losses.MeanSquaredError()) ``` # Arguments reduction: (Optional) Type of loss reduction to apply to loss. Default value is `SUM_OVER_BATCH_SIZE`. name: (Optional) name for the loss. """ def __init__(self, reduction=losses_utils.Reduction.SUM_OVER_BATCH_SIZE, name='mean_squared_error'): super(MeanSquaredError, self).__init__( mean_squared_error, name=name, reduction=reduction) class MeanAbsoluteError(LossFunctionWrapper): """Computes the mean of absolute difference between labels and predictions. Standalone usage: ```python mae = keras.losses.MeanAbsoluteError() loss = mae([0., 0., 1., 1.], [1., 1., 1., 0.]) ``` Usage with the `compile` API: ```python model = keras.Model(inputs, outputs) model.compile('sgd', loss=keras.losses.MeanAbsoluteError()) ``` # Arguments reduction: (Optional) Type of loss reduction to apply to loss. Default value is `SUM_OVER_BATCH_SIZE`. name: (Optional) name for the loss. """ def __init__(self, reduction=losses_utils.Reduction.SUM_OVER_BATCH_SIZE, name='mean_absolute_error'): super(MeanAbsoluteError, self).__init__( mean_absolute_error, name=name, reduction=reduction) class MeanAbsolutePercentageError(LossFunctionWrapper): """Computes the mean absolute percentage error between `y_true` and `y_pred`. Standalone usage: ```python mape = keras.losses.MeanAbsolutePercentageError() loss = mape([0., 0., 1., 1.], [1., 1., 1., 0.]) ``` Usage with the `compile` API: ```python model = keras.Model(inputs, outputs) model.compile('sgd', loss=keras.losses.MeanAbsolutePercentageError()) ``` # Arguments reduction: (Optional) Type of loss reduction to apply to loss. Default value is `SUM_OVER_BATCH_SIZE`. name: (Optional) name for the loss. """ def __init__(self, reduction=losses_utils.Reduction.SUM_OVER_BATCH_SIZE, name='mean_absolute_percentage_error'): super(MeanAbsolutePercentageError, self).__init__( mean_absolute_percentage_error, name=name, reduction=reduction) class MeanSquaredLogarithmicError(LossFunctionWrapper): """Computes the mean squared logarithmic error between `y_true` and `y_pred`. Standalone usage: ```python msle = keras.losses.MeanSquaredLogarithmicError() loss = msle([0., 0., 1., 1.], [1., 1., 1., 0.]) ``` Usage with the `compile` API: ```python model = keras.Model(inputs, outputs) model.compile('sgd', loss=keras.losses.MeanSquaredLogarithmicError()) ``` # Arguments reduction: (Optional) Type of loss reduction to apply to loss. Default value is `SUM_OVER_BATCH_SIZE`. name: (Optional) name for the loss. """ def __init__(self, reduction=losses_utils.Reduction.SUM_OVER_BATCH_SIZE, name='mean_squared_logarithmic_error'): super(MeanSquaredLogarithmicError, self).__init__( mean_squared_logarithmic_error, name=name, reduction=reduction) class BinaryCrossentropy(LossFunctionWrapper): """Computes the cross-entropy loss between true labels and predicted labels. Use this cross-entropy loss when there are only two label classes (assumed to be 0 and 1). For each example, there should be a single floating-point value per prediction. In the snippet below, each of the four examples has only a single floating-pointing value, and both `y_pred` and `y_true` have the shape `[batch_size]`. Standalone usage: ```python bce = keras.losses.BinaryCrossentropy() loss = bce([0., 0., 1., 1.], [1., 1., 1., 0.]) ``` Usage with the `compile` API: ```python model = keras.Model(inputs, outputs) model.compile('sgd', loss=keras.losses.BinaryCrossentropy()) ``` # Arguments from_logits: Whether to interpret `y_pred` as a tensor of [logit](https://en.wikipedia.org/wiki/Logit) values. By default, we assume that `y_pred` contains probabilities (i.e., values in [0, 1]). label_smoothing: Float in [0, 1]. When 0, no smoothing occurs. When > 0, we compute the loss between the predicted labels and a smoothed version of the true labels, where the smoothing squeezes the labels towards 0.5. Larger values of `label_smoothing` correspond to heavier smoothing. reduction: (Optional) Type of loss reduction to apply to loss. Default value is `SUM_OVER_BATCH_SIZE`. name: (Optional) Name for the object. """ def __init__(self, from_logits=False, label_smoothing=0, reduction=losses_utils.Reduction.SUM_OVER_BATCH_SIZE, name='binary_crossentropy'): super(BinaryCrossentropy, self).__init__( binary_crossentropy, name=name, reduction=reduction, from_logits=from_logits, label_smoothing=label_smoothing) self.from_logits = from_logits class CategoricalCrossentropy(LossFunctionWrapper): """Computes the crossentropy loss between the labels and predictions. Use this crossentropy loss function when there are two or more label classes. We expect labels to be provided in a `one_hot` representation. If you want to provide labels as integers, please use `SparseCategoricalCrossentropy` loss. There should be `# classes` floating point values per feature. In the snippet below, there is `# classes` floating pointing values per example. The shape of both `y_pred` and `y_true` are `[batch_size, num_classes]`. Standalone usage: ```python cce = keras.losses.CategoricalCrossentropy() loss = cce( [[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]], [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]]) ``` Usage with the `compile` API: ```python model = keras.Model(inputs, outputs) model.compile('sgd', loss=keras.losses.CategoricalCrossentropy()) ``` # Arguments from_logits: Whether to interpret `y_pred` as a tensor of [logit](https://en.wikipedia.org/wiki/Logit) values. By default, we assume that `y_pred` contains probabilities (i.e., values in [0, 1]). label_smoothing: Float in [0, 1]. When 0, no smoothing occurs. When > 0, we compute the loss between the predicted labels and a smoothed version of the true labels, where the smoothing squeezes the labels towards 0.5. Larger values of `label_smoothing` correspond to heavier smoothing. reduction: (Optional) Type of loss reduction to apply to loss. Default value is `SUM_OVER_BATCH_SIZE`. name: (Optional) Name for the object. """ def __init__(self, from_logits=False, label_smoothing=0, reduction=losses_utils.Reduction.SUM_OVER_BATCH_SIZE, name='categorical_crossentropy'): super(CategoricalCrossentropy, self).__init__( categorical_crossentropy, name=name, reduction=reduction, from_logits=from_logits, label_smoothing=label_smoothing) class SparseCategoricalCrossentropy(LossFunctionWrapper): """Computes the crossentropy loss between the labels and predictions. Use this crossentropy loss function when there are two or more label classes. We expect labels to be provided as integers. If you want to provide labels using `one-hot` representation, please use `CategoricalCrossentropy` loss. There should be `# classes` floating point values per feature for `y_pred` and a single floating point value per feature for `y_true`. In the snippet below, there is a single floating point value per example for `y_true` and `# classes` floating pointing values per example for `y_pred`. The shape of `y_true` is `[batch_size]` and the shape of `y_pred` is `[batch_size, num_classes]`. Standalone usage: ```python cce = keras.losses.SparseCategoricalCrossentropy() loss = cce( [0, 1, 2], [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]]) ``` Usage with the `compile` API: ```python model = keras.Model(inputs, outputs) model.compile('sgd', loss=keras.losses.SparseCategoricalCrossentropy()) ``` # Arguments from_logits: Whether to interpret `y_pred` as a tensor of [logit](https://en.wikipedia.org/wiki/Logit) values. By default, we assume that `y_pred` contains probabilities (i.e., values in [0, 1]). reduction: (Optional) Type of loss reduction to apply to loss. Default value is `SUM_OVER_BATCH_SIZE`. name: (Optional) Name for the object. """ def __init__(self, from_logits=False, reduction=losses_utils.Reduction.SUM_OVER_BATCH_SIZE, name='sparse_categorical_crossentropy'): super(SparseCategoricalCrossentropy, self).__init__( sparse_categorical_crossentropy, name=name, reduction=reduction, from_logits=from_logits) class Hinge(LossFunctionWrapper): """Computes the hinge loss between `y_true` and `y_pred`. `y_true` values are expected to be -1 or 1. If binary (0 or 1) labels are provided we will convert them to -1 or 1. Usage with the `compile` API: ```python model = keras.Model(inputs, outputs) model.compile('sgd', loss=keras.losses.Hinge()) ``` # Arguments reduction: (Optional) Type of loss reduction to apply to loss. Default value is `SUM_OVER_BATCH_SIZE`. name: (Optional) Name for the object. """ def __init__(self, reduction=losses_utils.Reduction.SUM_OVER_BATCH_SIZE, name='hinge'): super(Hinge, self).__init__(hinge, name=name, reduction=reduction) class SquaredHinge(LossFunctionWrapper): """Computes the squared hinge loss between `y_true` and `y_pred`. `y_true` values are expected to be -1 or 1. If binary (0 or 1) labels are provided we will convert them to -1 or 1. Usage with the `compile` API: ```python model = keras.Model(inputs, outputs) model.compile('sgd', loss=keras.losses.SquaredHinge()) ``` # Arguments reduction: (Optional) Type of loss reduction to apply to loss. Default value is `SUM_OVER_BATCH_SIZE`. name: (Optional) Name for the object. """ def __init__(self, reduction=losses_utils.Reduction.SUM_OVER_BATCH_SIZE, name='squared_hinge'): super(SquaredHinge, self).__init__( squared_hinge, name=name, reduction=reduction) class CategoricalHinge(LossFunctionWrapper): """Computes the categorical hinge loss between `y_true` and `y_pred`. Usage with the `compile` API: ```python model = keras.Model(inputs, outputs) model.compile('sgd', loss=keras.losses.CategoricalHinge()) ``` # Arguments reduction: (Optional) Type of loss reduction to apply to loss. Default value is `SUM_OVER_BATCH_SIZE`. name: (Optional) Name for the object. """ def __init__(self, reduction=losses_utils.Reduction.SUM_OVER_BATCH_SIZE, name='categorical_hinge'): super(CategoricalHinge, self).__init__( categorical_hinge, name=name, reduction=reduction) class Poisson(LossFunctionWrapper): """Computes the Poisson loss between `y_true` and `y_pred`. `loss = y_pred - y_true * log(y_pred)` Usage with the `compile` API: ```python model = keras.Model(inputs, outputs) model.compile('sgd', loss=keras.losses.Poisson()) ``` # Arguments reduction: (Optional) Type of loss reduction to apply to loss. Default value is `SUM_OVER_BATCH_SIZE`. name: (Optional) Name for the object. """ def __init__(self, reduction=losses_utils.Reduction.SUM_OVER_BATCH_SIZE, name='poisson'): super(Poisson, self).__init__(poisson, name=name, reduction=reduction) class LogCosh(LossFunctionWrapper): """Computes the logarithm of the hyperbolic cosine of the prediction error. `logcosh = log((exp(x) + exp(-x))/2)`, where x is the error (y_pred - y_true) Usage with the `compile` API: ```python model = keras.Model(inputs, outputs) model.compile('sgd', loss=keras.losses.LogCosh()) ``` # Arguments reduction: (Optional) Type of loss reduction to apply to loss. Default value is `SUM_OVER_BATCH_SIZE`. name: (Optional) Name for the object. """ def __init__(self, reduction=losses_utils.Reduction.SUM_OVER_BATCH_SIZE, name='logcosh'): super(LogCosh, self).__init__(logcosh, name=name, reduction=reduction) class KLDivergence(LossFunctionWrapper): """Computes Kullback-Leibler divergence loss between `y_true` and `y_pred`. `loss = y_true * log(y_true / y_pred)` See: https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence Usage with the `compile` API: ```python model = keras.Model(inputs, outputs) model.compile('sgd', loss=keras.losses.KLDivergence()) ``` # Arguments reduction: (Optional) Type of loss reduction to apply to loss. Default value is `SUM_OVER_BATCH_SIZE`. name: (Optional) Name for the object. """ def __init__(self, reduction=losses_utils.Reduction.SUM_OVER_BATCH_SIZE, name='kullback_leibler_divergence'): super(KLDivergence, self).__init__( kullback_leibler_divergence, name=name, reduction=reduction) class Huber(LossFunctionWrapper): """Computes the Huber loss between `y_true` and `y_pred`. Given `x = y_true - y_pred`: ``` loss = 0.5 * x^2 if |x| <= d loss = 0.5 * d^2 + d * (|x| - d) if |x| > d ``` where d is `delta`. See: https://en.wikipedia.org/wiki/Huber_loss Usage with the `compile` API: ```python model = keras.Model(inputs, outputs) model.compile('sgd', loss=keras.losses.Huber()) ``` # Arguments delta: A float, the point where the Huber loss function changes from a quadratic to linear. reduction: (Optional) Type of reduction to apply to loss. name: Optional name for the object. """ def __init__(self, delta=1.0, reduction=losses_utils.Reduction.SUM_OVER_BATCH_SIZE, name='huber_loss'): super(Huber, self).__init__( huber_loss, name=name, reduction=reduction, delta=delta) def mean_squared_error(y_true, y_pred): if not K.is_tensor(y_pred): y_pred = K.constant(y_pred) y_true = K.cast(y_true, y_pred.dtype) return K.mean(K.square(y_pred - y_true), axis=-1) def mean_absolute_error(y_true, y_pred): if not K.is_tensor(y_pred): y_pred = K.constant(y_pred) y_true = K.cast(y_true, y_pred.dtype) return K.mean(K.abs(y_pred - y_true), axis=-1) def mean_absolute_percentage_error(y_true, y_pred): if not K.is_tensor(y_pred): y_pred = K.constant(y_pred) y_true = K.cast(y_true, y_pred.dtype) diff = K.abs((y_true - y_pred) / K.clip(K.abs(y_true), K.epsilon(), None)) return 100. * K.mean(diff, axis=-1) def mean_squared_logarithmic_error(y_true, y_pred): if not K.is_tensor(y_pred): y_pred = K.constant(y_pred) y_true = K.cast(y_true, y_pred.dtype) first_log = K.log(K.clip(y_pred, K.epsilon(), None) + 1.) second_log = K.log(K.clip(y_true, K.epsilon(), None) + 1.) return K.mean(K.square(first_log - second_log), axis=-1) def squared_hinge(y_true, y_pred): y_true = _maybe_convert_labels(y_true) return K.mean(K.square(K.maximum(1. - y_true * y_pred, 0.)), axis=-1) def hinge(y_true, y_pred): y_true = _maybe_convert_labels(y_true) return K.mean(K.maximum(1. - y_true * y_pred, 0.), axis=-1) def categorical_hinge(y_true, y_pred): pos = K.sum(y_true * y_pred, axis=-1) neg = K.max((1. - y_true) * y_pred, axis=-1) return K.maximum(0., neg - pos + 1.) def logcosh(y_true, y_pred): """Logarithm of the hyperbolic cosine of the prediction error. `log(cosh(x))` is approximately equal to `(x ** 2) / 2` for small `x` and to `abs(x) - log(2)` for large `x`. This means that 'logcosh' works mostly like the mean squared error, but will not be so strongly affected by the occasional wildly incorrect prediction. # Arguments y_true: tensor of true targets. y_pred: tensor of predicted targets. # Returns Tensor with one scalar loss entry per sample. """ def _logcosh(x): return x + K.softplus(-2. * x) - K.log(2.) return K.mean(_logcosh(y_pred - y_true), axis=-1) def huber_loss(y_true, y_pred, delta=1.0): error = y_pred - y_true abs_error = K.abs(error) quadratic = K.minimum(abs_error, delta) linear = abs_error - quadratic return 0.5 * K.square(quadratic) + delta * linear def categorical_crossentropy(y_true, y_pred, from_logits=False, label_smoothing=0): y_pred = K.constant(y_pred) if not K.is_tensor(y_pred) else y_pred y_true = K.cast(y_true, y_pred.dtype) if label_smoothing is not 0: smoothing = K.cast_to_floatx(label_smoothing) def _smooth_labels(): num_classes = K.cast(K.shape(y_true)[1], y_pred.dtype) return y_true * (1.0 - smoothing) + (smoothing / num_classes) y_true = K.switch(K.greater(smoothing, 0), _smooth_labels, lambda: y_true) return K.categorical_crossentropy(y_true, y_pred, from_logits=from_logits) def sparse_categorical_crossentropy(y_true, y_pred, from_logits=False, axis=-1): return K.sparse_categorical_crossentropy( y_true, y_pred, from_logits=from_logits, axis=axis) def binary_crossentropy(y_true, y_pred, from_logits=False, label_smoothing=0): y_pred = K.constant(y_pred) if not K.is_tensor(y_pred) else y_pred y_true = K.cast(y_true, y_pred.dtype) if label_smoothing is not 0: smoothing = K.cast_to_floatx(label_smoothing) y_true = K.switch(K.greater(smoothing, 0), lambda: y_true * (1.0 - smoothing) + 0.5 * smoothing, lambda: y_true) return K.mean( K.binary_crossentropy(y_true, y_pred, from_logits=from_logits), axis=-1) def kullback_leibler_divergence(y_true, y_pred): y_true = K.clip(y_true, K.epsilon(), 1) y_pred = K.clip(y_pred, K.epsilon(), 1) return K.sum(y_true * K.log(y_true / y_pred), axis=-1) def poisson(y_true, y_pred): return K.mean(y_pred - y_true * K.log(y_pred + K.epsilon()), axis=-1) def cosine_proximity(y_true, y_pred, axis=-1): y_true = K.l2_normalize(y_true, axis=axis) y_pred = K.l2_normalize(y_pred, axis=axis) return - K.sum(y_true * y_pred, axis=axis) def _maybe_convert_labels(y_true): """Converts binary labels into -1/1.""" are_zeros = K.equal(y_true, 0) are_ones = K.equal(y_true, 1) are_zeros = K.expand_dims(are_zeros, 0) are_ones = K.expand_dims(are_ones, 0) are_different = K.concatenate([are_zeros, are_ones], axis=0) are_different = K.any(are_different, axis=0) is_binary = K.all(are_different) def _convert_binary_labels(): # Convert the binary labels to -1 or 1. return 2. * y_true - 1. updated_y_true = K.switch(is_binary, _convert_binary_labels, lambda: y_true) return updated_y_true # Aliases. mse = MSE = mean_squared_error mae = MAE = mean_absolute_error mape = MAPE = mean_absolute_percentage_error msle = MSLE = mean_squared_logarithmic_error kld = KLD = kullback_leibler_divergence cosine = cosine_similarity = cosine_proximity def is_categorical_crossentropy(loss): return (isinstance(loss, CategoricalCrossentropy) or (isinstance(loss, LossFunctionWrapper) and loss.fn == categorical_crossentropy) or (hasattr(loss, '__name__') and loss.__name__ == 'categorical_crossentropy') or loss == 'categorical_crossentropy') def serialize(loss): return serialize_keras_object(loss) def deserialize(name, custom_objects=None): return deserialize_keras_object(name, module_objects=globals(), custom_objects=custom_objects, printable_module_name='loss function') def get(identifier): """Get the `identifier` loss function. # Arguments identifier: None or str, name of the function. # Returns The loss function or None if `identifier` is None. # Raises ValueError if unknown identifier. """ if identifier is None: return None if isinstance(identifier, six.string_types): identifier = str(identifier) return deserialize(identifier) if isinstance(identifier, dict): return deserialize(identifier) elif callable(identifier): return identifier else: raise ValueError('Could not interpret ' 'loss function identifier:', identifier)