# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Eager-graph unified check numerics callback."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import threading

import numpy as np

from tensorflow.core.protobuf import debug_event_pb2
from tensorflow.python.debug.lib import op_callbacks_common
from tensorflow.python.debug.lib import source_utils
from tensorflow.python.eager import monitoring
from tensorflow.python.framework import op_callbacks
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gen_debug_ops
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util import compat
from tensorflow.python.util.tf_export import tf_export


# Many ops have benign NaN outputs, and running them with check_numerics
# on will create unwanted errors
# TODO(b/142497024): Replace this allowlist with function decorators in the ops
IGNORE_OP_OUTPUTS = (
    # For FusedBatchNorm, if the input tensor is empty then batch_mean and
    # batch_variance will be NaN. reserve_space holds intermediate values
    # derived from batch_mean and batch_variance used for gradient calculation
    (b"FusedBatchNorm", 1),  # batch_mean
    (b"FusedBatchNorm", 2),  # batch_variance
    (b"FusedBatchNorm", 3),  # reserve_space_1
    (b"FusedBatchNorm", 4),  # reserve_space_2

    # Same as above
    (b"FusedBatchNormV2", 1),  # batch_mean
    (b"FusedBatchNormV2", 2),  # batch_variance
    (b"FusedBatchNormV2", 3),  # reserve_space_1
    (b"FusedBatchNormV2", 4),  # reserve_space_2

    # Same as above, but reserve_space_3 holds additional intermediate values
    (b"FusedBatchNormV3", 1),  # batch_mean
    (b"FusedBatchNormV3", 2),  # batch_variance
    (b"FusedBatchNormV3", 3),  # reserve_space_1
    (b"FusedBatchNormV3", 4),  # reserve_space_2
    (b"FusedBatchNormV3", 5),  # reserve_space_3
)

# Some frequently used ops are generally safe and we can skip them to reduce
# overhead. NOTE: This list is compiled by observing operations called by
# models in practice and is not a comprehensive list of safe operations.
SAFE_OPS = (
    b"Concat",
    b"ConcatV2",
    b"ExpandDims",
    b"Fill",
    b"Gather",
    b"Maximum",
    b"Minimum",
    b"Reshape",
    b"Slice",
    b"Squeeze",
    b"Stack",
    b"StridedSlice",
    b"StridedSliceGrad",
    b"TensorListConcatV2",
    b"TensorListGather",
    b"TensorListGetItem",
    b"TensorListPopBack",
    b"TensorListStack",
    b"Transpose",
    b"Unpack",
)

_state = threading.local()

_check_numerics_callback_create_counter = monitoring.Counter(
    "/tensorflow/api/python/debugging/check_numerics_callback_create_counter",
    "Counter for number of times the check_numerics op callback is created.")


def limit_string_length(string, max_len=50):
  """Limit the length of input string.

  Args:
    string: Input string.
    max_len: (int or None) If int, the length limit. If None, no limit.

  Returns:
    Possibly length-limited string.
  """
  if max_len is None or len(string) <= max_len:
    return string
  else:
    return "..." + string[len(string) - max_len:]


# A dictionary that supports looking up the original input tensor names.
_CHECK_NUMERICS_INPUT_LOOKUP = collections.defaultdict(dict)


def _maybe_lookup_original_input_tensor(graph, tensor):
  if (graph and
      graph in _CHECK_NUMERICS_INPUT_LOOKUP and
      tensor.name in _CHECK_NUMERICS_INPUT_LOOKUP[graph]):
    return _CHECK_NUMERICS_INPUT_LOOKUP[graph][tensor.name]
  else:
    return tensor


def get_check_numerics_error_message(slot,
                                     num_outputs,
                                     op_type,
                                     tensor,
                                     inputs,
                                     graph=None,
                                     traceback=None,
                                     stack_height_limit=30,
                                     path_length_limit=50):
  """Create a meaningful and user-friendly error message about offending tensor.

  The error message reveals the following info about the op that outputs
  NaN/Infinity: dtype, shape (to the extent known at graph-construction time),
  input tensors, stack trace for op creation (if is graph mode).

  Args:
    slot: (int) slot index of the tensor output.
    num_outputs: (int) total number of outputs of the op.
    op_type: (str) Type of the that generates `tensor`.
    tensor: (Tensor) the offending tensor, i.e., the tensor that contains
      Infinities or NaNs.
    inputs: (array of Tensor) inputs to the op that generates `tensor`.
    graph: (tf.Graph) the graph object that `tensor` belongs to. Available only
      under graph mode.
    traceback: (list of trace frames) the stack trace of the op's creation.
      Available only under graph model.
    stack_height_limit: (int or None) If int, limit to the height of the stack
      trace printed in the error message. If None, no limit to the height.
    path_length_limit: (int or None) Length limit for file paths included in the
      formatted stack trace.

  Returns:
    (str) A formatted error message.
  """
  eager_vs_graph_qualifier = "graph" if graph else "eagerly-executing"
  message = "\n"
  message += (
      "\n!!! Detected Infinity or NaN in output %d of "
      "%s op \"%s\" (# of outputs: %d) !!!\n" %
      (slot, eager_vs_graph_qualifier, op_type, num_outputs))

  message += "  dtype: %s\n" % tensor.dtype
  message += "  shape: %s\n" % (tensor.shape,)

  if not graph:
    # This is an eager tensor. We can get its numpy value and count
    # NaNs and Infs.
    is_inf = np.isinf(tensor)

    num_neg_inf = np.sum(np.logical_and(np.less(tensor, 0.), is_inf))
    num_pos_inf = np.sum(np.logical_and(np.greater(tensor, 0.), is_inf))
    num_nan = np.sum(np.isnan(tensor))
    if num_neg_inf > 0:
      message += "  # of -Inf elements: %s\n" % num_neg_inf
    if num_pos_inf > 0:
      message += "  # of +Inf elements: %s\n" % num_pos_inf
    if num_nan:
      message += "  # of +NaN elements: %s\n" % num_nan

  if len(inputs) > 1:
    message += "\n  Input tensors (%d):\n" % len(inputs)
    for slot, input_tensor in enumerate(inputs):
      message += "         %d: %s\n" % (
          slot, _maybe_lookup_original_input_tensor(graph, input_tensor))
  elif len(inputs) == 1:
    message += "\n  Input tensor: %s\n" % (
        _maybe_lookup_original_input_tensor(graph, inputs[0]))
  if graph and hasattr(graph, "name") and graph.name:
    message += "  Graph name: \"%s\"\n" % graph.name

  # Format the stack trace for the op's creation. We omit files that
  # belong to tensorflow itself.
  if graph and traceback:
    message += (
        "\n  Stack trace of op's creation (\"->\": inferred user code):\n")
    if stack_height_limit is not None and len(traceback) > stack_height_limit:
      num_omitted_frames = len(traceback) - stack_height_limit
      message += "    + ... (Omitted %d frames)\n" % num_omitted_frames
    for filepath, lineno, function_name, source_line in traceback[
        -stack_height_limit:]:
      user_code_indicator = "    "
      if not source_utils.guess_is_tensorflow_py_library(filepath):
        user_code_indicator = " -> "

      message += "    + %s (L%d) %s\n" % (
          limit_string_length(filepath, path_length_limit), lineno,
          function_name)
      if source_line is not None:
        message += "%s|   %s\n" % (user_code_indicator, source_line)
  message += "\n"
  return message


def _debug_summary(x):
  return gen_debug_ops.debug_numeric_summary_v2(
      x,
      tensor_debug_mode=(
          debug_event_pb2.TensorDebugMode.REDUCE_INF_NAN_THREE_SLOTS))


class CheckNumericsCallback(object):
  """Wrapper for the numerics-checking callback for thread locality."""

  def __init__(self, stack_height_limit, path_length_limit):
    self._stack_height_limit = stack_height_limit
    self._path_length_limit = path_length_limit
    # A dict mapping Placeholder tensors to their instrumenting debug tensors.
    # Used only under V1 graph mode, where we can't rely on auto control
    # dependency to execute the debug tensors and hence need to attach the debug
    # tensors as control dependencies of the ops that consume the Placeholder.
    self._placeholder_to_debug_tensor = dict()

  def callback(self,
               op_type,
               inputs,
               attrs,
               outputs,
               op_name=None,
               graph=None):
    """Eager-function unified callback for checking numerics."""
    del attrs, op_name  # Unused
    op_type_bytes = compat.as_bytes(op_type)
    is_v1_graph_mode = not ops.executing_eagerly_outside_functions()
    if (op_type_bytes in op_callbacks_common.OP_CALLBACK_SKIP_OPS or
        op_type_bytes in SAFE_OPS):
      return None
    if graph:
      # Under graph mode. Insert check_numerics op.
      instrumented_outputs = []
      if is_v1_graph_mode:
        for input_tensor in inputs:
          if input_tensor in self._placeholder_to_debug_tensor and outputs:
            outputs[0].op._add_control_input(  # pylint: disable=protected-access
                self._placeholder_to_debug_tensor[input_tensor].op)
      for slot, output in enumerate(outputs):
        if (output.dtype.is_floating and
            (op_type_bytes, slot) not in IGNORE_OP_OUTPUTS):
          checked_output = array_ops.check_numerics_v2(
              # TF v2 has automatic control dependencies added to stateful async
              # ops, which allows us to run check_numerics asynchronously.
              # In the above case we use debug_summary to reduce all output
              # tensors asynchronously from the op being checked and then
              # process the tensor summary with check_numerics.
              output if is_v1_graph_mode else _debug_summary(output),
              get_check_numerics_error_message(
                  slot,
                  len(outputs),
                  op_type,
                  output,
                  inputs,
                  graph=graph,
                  traceback=output.op.traceback,
                  stack_height_limit=self._stack_height_limit,
                  path_length_limit=self._path_length_limit))
          _CHECK_NUMERICS_INPUT_LOOKUP[graph][checked_output.name] = output
          instrumented_outputs.append(self._get_output_tensor(
              op_type_bytes, output, checked_output, is_v1_graph_mode))
        else:
          instrumented_outputs.append(output)
      return instrumented_outputs
    else:
      if op_type_bytes == b"CheckNumericsV2":
        # TODO(b/140334369): Remove this special casing logic once op_callback.
        # automatically prevents infinite recursion in eager mode.
        return None
      # Under eager mode. Eagerly execute check_numerics op.
      for slot, output in enumerate(outputs):
        if (output.dtype.is_floating and
            (op_type_bytes, slot) not in IGNORE_OP_OUTPUTS):
          array_ops.check_numerics_v2(
              output,
              get_check_numerics_error_message(
                  slot, len(outputs), op_type, output, inputs,
                  stack_height_limit=self._stack_height_limit,
                  path_length_limit=self._path_length_limit))

  def _get_output_tensor(self,
                         op_type,
                         tensor,
                         checked_tensor,
                         is_v1_graph_mode):
    """Determine what tensor to output from callback.

    Args:
      op_type: Type of the op that outputs the original symbolic tensor, as
        `bytes`.
      tensor: The original output symbolic tensor.
      checked_tensor: The debugger-instrumented, numerics-checking tensor.
      is_v1_graph_mode: Whether the debugged proggram is running under V1 graph
        mode.

    Returns:
      A symbolic tensor to be returned by the dumping op_callback.
    """
    if is_v1_graph_mode:
      # Placeholders need special treatment under V1 graph mode. The
      # callback can't simply override the Placeholder tensor to the debug
      # tensor, as that would cause the Placeholder op to lack a value.
      # The debug tensor is remembered and will be attached as control
      # inputs to ops that consumer the Placeholders later.
      if op_type == b"Placeholder":
        self._placeholder_to_debug_tensor[tensor] = checked_tensor
        return tensor
      else:
        return checked_tensor
    else:
      # Under non-v1 graph mode, rely on auto control dependency to run the
      # checked tensor.
      return tensor


@tf_export("debugging.enable_check_numerics")
def enable_check_numerics(stack_height_limit=30,
                          path_length_limit=50):
  r"""Enable tensor numerics checking in an eager/graph unified fashion.

  The numerics checking mechanism will cause any TensorFlow eager execution or
  graph execution to error out as soon as an op's output tensor contains
  infinity or NaN.

  This method is idempotent. Calling it multiple times has the same effect
  as calling it once.

  This method takes effect only on the thread in which it is called.

  When a op's float-type output tensor contains any Infinity or NaN, an
  `tf.errors.InvalidArgumentError` will be thrown, with an error message that
  reveals the following information:
    - The type of the op that generated the tensor with bad numerics.
    - Data type (dtype) of the tensor.
    - Shape of the tensor (to the extent known at the time of eager execution
      or graph construction).
    - Name of the containing graph (if available).
    - (Graph mode only): The stack trace of the intra-graph op's creation,
      with a stack-height limit and a path-length limit for visual clarity.
      The stack frames that belong to the user's code (as opposed to
      tensorflow's internal code) are highlighted with a text arrow ("->").
    - (Eager mode only): How many of the offending tensor's elements are
      `Infinity` and `NaN`, respectively.

  Once enabled, the check-numerics mechanism can be disabled by using
  `tf.debugging.disable_check_numerics()`.

  Example usage:

  1. Catching infinity during the execution of a `tf.function` graph:

     ```py
     import tensorflow as tf

     tf.debugging.enable_check_numerics()

     @tf.function
     def square_log_x_plus_1(x):
       v = tf.math.log(x + 1)
       return tf.math.square(v)

     x = -1.0

     # When the following line runs, a function graph will be compiled
     # from the Python function `square_log_x_plus_1()`. Due to the
     # `enable_check_numerics()` call above, the graph will contain
     # numerics checking ops that will run during the function graph's
     # execution. The function call generates an -infinity when the Log
     # (logarithm) op operates on the output tensor of the Add op.
     # The program errors out at this line, printing an error message.
     y = square_log_x_plus_1(x)
     z = -y
    ```

  2. Catching NaN during eager execution:

     ```py
     import numpy as np
     import tensorflow as tf

     tf.debugging.enable_check_numerics()

     x = np.array([[0.0, -1.0], [4.0, 3.0]])

     # The following line executes the Sqrt op eagerly. Due to the negative
     # element in the input array, a NaN is generated. Due to the
     # `enable_check_numerics()` call above, the program errors immediately
     # at this line, printing an error message.
     y = tf.math.sqrt(x)
     z = tf.matmul(y, y)
     ```

  NOTE: If your code is running on TPUs, be sure to call
  `tf.config.set_soft_device_placement(True)` before calling
  `tf.debugging.enable_check_numerics()` as this API uses automatic outside
  compilation on TPUs. For example:

  ```py
  tf.config.set_soft_device_placement(True)
  tf.debugging.enable_check_numerics()

  resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
  strategy = tf.distribute.TPUStrategy(resolver)
  with strategy.scope():
    # ...
  ```

  Args:
    stack_height_limit: Limit to the height of the printed stack trace.
      Applicable only to ops in `tf.function`s (graphs).
    path_length_limit: Limit to the file path included in the printed stack
      trace. Applicable only to ops in `tf.function`s (graphs).
  """
  if not hasattr(_state, "check_numerics_callback"):
    _state.check_numerics_callback = CheckNumericsCallback(
        stack_height_limit, path_length_limit)
  op_callbacks.add_op_callback(_state.check_numerics_callback.callback)

  logging.info(
      "Enabled check-numerics callback in thread %s",
      threading.current_thread().name)
  _check_numerics_callback_create_counter.get_cell().increase_by(1)


@tf_export("debugging.disable_check_numerics")
def disable_check_numerics():
  """Disable the eager/graph unified numerics checking mechanism.

  This method can be used after a call to `tf.debugging.enable_check_numerics()`
  to disable the numerics-checking mechanism that catches infinity and NaN
  values output by ops executed eagerly or in tf.function-compiled graphs.

  This method is idempotent. Calling it multiple times has the same effect
  as calling it once.

  This method takes effect only on the thread in which it is called.
  """
  if not hasattr(_state, "check_numerics_callback"):
    return
  try:
    op_callbacks.remove_op_callback(_state.check_numerics_callback.callback)
    delattr(_state, "check_numerics_callback")
    logging.info(
        "Disabled check-numerics callback in thread %s",
        threading.current_thread().name)
  except KeyError:
    # Tolerate disabling the check numerics callback without
    # enable_check_numerics() being called first.
    pass