# Copyright 2016 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== """TensorFlow Debugger (tfdbg) Utilities.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import re from six.moves import xrange # pylint: disable=redefined-builtin def add_debug_tensor_watch(run_options, node_name, output_slot=0, debug_ops="DebugIdentity", debug_urls=None, tolerate_debug_op_creation_failures=False, global_step=-1): """Add watch on a `Tensor` to `RunOptions`. N.B.: 1. Under certain circumstances, the `Tensor` may not get actually watched (e.g., if the node of the `Tensor` is constant-folded during runtime). 2. For debugging purposes, the `parallel_iteration` attribute of all `tf.while_loop`s in the graph are set to 1 to prevent any node from being executed multiple times concurrently. This change does not affect subsequent non-debugged runs of the same `tf.while_loop`s. Args: run_options: An instance of `config_pb2.RunOptions` to be modified. node_name: (`str`) name of the node to watch. output_slot: (`int`) output slot index of the tensor from the watched node. debug_ops: (`str` or `list` of `str`) name(s) of the debug op(s). Can be a `list` of `str` or a single `str`. The latter case is equivalent to a `list` of `str` with only one element. For debug op types with customizable attributes, each debug op string can optionally contain a list of attribute names, in the syntax of: debug_op_name(attr_name_1=attr_value_1;attr_name_2=attr_value_2;...) debug_urls: (`str` or `list` of `str`) URL(s) to send debug values to, e.g., `file:///tmp/tfdbg_dump_1`, `grpc://localhost:12345`. tolerate_debug_op_creation_failures: (`bool`) Whether to tolerate debug op creation failures by not throwing exceptions. global_step: (`int`) Optional global_step count for this debug tensor watch. """ watch_opts = run_options.debug_options.debug_tensor_watch_opts run_options.debug_options.global_step = global_step watch = watch_opts.add() watch.tolerate_debug_op_creation_failures = ( tolerate_debug_op_creation_failures) watch.node_name = node_name watch.output_slot = output_slot if isinstance(debug_ops, str): debug_ops = [debug_ops] watch.debug_ops.extend(debug_ops) if debug_urls: if isinstance(debug_urls, str): debug_urls = [debug_urls] watch.debug_urls.extend(debug_urls) def watch_graph(run_options, graph, debug_ops="DebugIdentity", debug_urls=None, node_name_regex_allowlist=None, op_type_regex_allowlist=None, tensor_dtype_regex_allowlist=None, tolerate_debug_op_creation_failures=False, global_step=-1, reset_disk_byte_usage=False): """Add debug watches to `RunOptions` for a TensorFlow graph. To watch all `Tensor`s on the graph, let both `node_name_regex_allowlist` and `op_type_regex_allowlist` be the default (`None`). N.B.: 1. Under certain circumstances, the `Tensor` may not get actually watched (e.g., if the node of the `Tensor` is constant-folded during runtime). 2. For debugging purposes, the `parallel_iteration` attribute of all `tf.while_loop`s in the graph are set to 1 to prevent any node from being executed multiple times concurrently. This change does not affect subsequent non-debugged runs of the same `tf.while_loop`s. Args: run_options: An instance of `config_pb2.RunOptions` to be modified. graph: An instance of `ops.Graph`. debug_ops: (`str` or `list` of `str`) name(s) of the debug op(s) to use. debug_urls: URLs to send debug values to. Can be a list of strings, a single string, or None. The case of a single string is equivalent to a list consisting of a single string, e.g., `file:///tmp/tfdbg_dump_1`, `grpc://localhost:12345`. For debug op types with customizable attributes, each debug op name string can optionally contain a list of attribute names, in the syntax of: debug_op_name(attr_name_1=attr_value_1;attr_name_2=attr_value_2;...) node_name_regex_allowlist: Regular-expression allowlist for node_name, e.g., `"(weight_[0-9]+|bias_.*)"` op_type_regex_allowlist: Regular-expression allowlist for the op type of nodes, e.g., `"(Variable|Add)"`. If both `node_name_regex_allowlist` and `op_type_regex_allowlist` are set, the two filtering operations will occur in a logical `AND` relation. In other words, a node will be included if and only if it hits both allowlists. tensor_dtype_regex_allowlist: Regular-expression allowlist for Tensor data type, e.g., `"^int.*"`. This allowlist operates in logical `AND` relations to the two allowlists above. tolerate_debug_op_creation_failures: (`bool`) whether debug op creation failures (e.g., due to dtype incompatibility) are to be tolerated by not throwing exceptions. global_step: (`int`) Optional global_step count for this debug tensor watch. reset_disk_byte_usage: (`bool`) whether to reset the tracked disk byte usage to zero (default: `False`). """ if not debug_ops: raise ValueError("debug_ops must not be empty or None.") if not debug_urls: raise ValueError("debug_urls must not be empty or None.") if isinstance(debug_ops, str): debug_ops = [debug_ops] node_name_pattern = ( re.compile(node_name_regex_allowlist) if node_name_regex_allowlist else None) op_type_pattern = ( re.compile(op_type_regex_allowlist) if op_type_regex_allowlist else None) tensor_dtype_pattern = ( re.compile(tensor_dtype_regex_allowlist) if tensor_dtype_regex_allowlist else None) ops = graph.get_operations() for op in ops: # Skip nodes without any output tensors. if not op.outputs: continue node_name = op.name op_type = op.type if node_name_pattern and not node_name_pattern.match(node_name): continue if op_type_pattern and not op_type_pattern.match(op_type): continue for slot in xrange(len(op.outputs)): if (tensor_dtype_pattern and not tensor_dtype_pattern.match(op.outputs[slot].dtype.name)): continue add_debug_tensor_watch( run_options, node_name, output_slot=slot, debug_ops=debug_ops, debug_urls=debug_urls, tolerate_debug_op_creation_failures=( tolerate_debug_op_creation_failures), global_step=global_step) # If no filter for node or tensor is used, will add a wildcard node name, so # that all nodes, including the ones created internally by TensorFlow itself # (e.g., by Grappler), can be watched during debugging. use_node_name_wildcard = (not node_name_pattern and not op_type_pattern and not tensor_dtype_pattern) if use_node_name_wildcard: add_debug_tensor_watch( run_options, "*", output_slot=-1, debug_ops=debug_ops, debug_urls=debug_urls, tolerate_debug_op_creation_failures=tolerate_debug_op_creation_failures, global_step=global_step) run_options.debug_options.reset_disk_byte_usage = reset_disk_byte_usage def watch_graph_with_denylists(run_options, graph, debug_ops="DebugIdentity", debug_urls=None, node_name_regex_denylist=None, op_type_regex_denylist=None, tensor_dtype_regex_denylist=None, tolerate_debug_op_creation_failures=False, global_step=-1, reset_disk_byte_usage=False): """Add debug tensor watches, denylisting nodes and op types. This is similar to `watch_graph()`, but the node names and op types are denylisted, instead of allowlisted. N.B.: 1. Under certain circumstances, the `Tensor` may not get actually watched (e.g., if the node of the `Tensor` is constant-folded during runtime). 2. For debugging purposes, the `parallel_iteration` attribute of all `tf.while_loop`s in the graph are set to 1 to prevent any node from being executed multiple times concurrently. This change does not affect subsequent non-debugged runs of the same `tf.while_loop`s. Args: run_options: An instance of `config_pb2.RunOptions` to be modified. graph: An instance of `ops.Graph`. debug_ops: (`str` or `list` of `str`) name(s) of the debug op(s) to use. See the documentation of `watch_graph` for more details. debug_urls: URL(s) to send debug values to, e.g., `file:///tmp/tfdbg_dump_1`, `grpc://localhost:12345`. node_name_regex_denylist: Regular-expression denylist for node_name. This should be a string, e.g., `"(weight_[0-9]+|bias_.*)"`. op_type_regex_denylist: Regular-expression denylist for the op type of nodes, e.g., `"(Variable|Add)"`. If both node_name_regex_denylist and op_type_regex_denylist are set, the two filtering operations will occur in a logical `OR` relation. In other words, a node will be excluded if it hits either of the two denylists; a node will be included if and only if it hits neither of the denylists. tensor_dtype_regex_denylist: Regular-expression denylist for Tensor data type, e.g., `"^int.*"`. This denylist operates in logical `OR` relations to the two allowlists above. tolerate_debug_op_creation_failures: (`bool`) whether debug op creation failures (e.g., due to dtype incompatibility) are to be tolerated by not throwing exceptions. global_step: (`int`) Optional global_step count for this debug tensor watch. reset_disk_byte_usage: (`bool`) whether to reset the tracked disk byte usage to zero (default: `False`). """ if isinstance(debug_ops, str): debug_ops = [debug_ops] node_name_pattern = ( re.compile(node_name_regex_denylist) if node_name_regex_denylist else None) op_type_pattern = ( re.compile(op_type_regex_denylist) if op_type_regex_denylist else None) tensor_dtype_pattern = ( re.compile(tensor_dtype_regex_denylist) if tensor_dtype_regex_denylist else None) ops = graph.get_operations() for op in ops: # Skip nodes without any output tensors. if not op.outputs: continue node_name = op.name op_type = op.type if node_name_pattern and node_name_pattern.match(node_name): continue if op_type_pattern and op_type_pattern.match(op_type): continue for slot in xrange(len(op.outputs)): if (tensor_dtype_pattern and tensor_dtype_pattern.match(op.outputs[slot].dtype.name)): continue add_debug_tensor_watch( run_options, node_name, output_slot=slot, debug_ops=debug_ops, debug_urls=debug_urls, tolerate_debug_op_creation_failures=( tolerate_debug_op_creation_failures), global_step=global_step) run_options.debug_options.reset_disk_byte_usage = reset_disk_byte_usage