from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import cntk as C
import numpy as np
from .common import floatx
from .common import epsilon
from .common import image_data_format
from .common import normalize_data_format
from ..utils.generic_utils import transpose_shape
from collections import defaultdict
from contextlib import contextmanager
import warnings


C.set_global_option('align_axis', 1)

b_any = any
py_slice = slice


dev = C.device.use_default_device()
if dev.type() == 0:
    warnings.warn(
        'CNTK backend warning: GPU is not detected. '
        'CNTK\'s CPU version is not fully optimized,'
        'please run with GPU to get better performance.')

# A learning phase is a bool tensor used to run Keras models in
# either train mode (learning_phase == 1) or test mode (learning_phase == 0).
# LEARNING_PHASE_PLACEHOLDER is the placeholder for dynamic learning phase
_LEARNING_PHASE_PLACEHOLDER = C.constant(
    shape=(), dtype=np.float32,
    value=1.0,
    name='_keras_learning_phase')
# static learning phase flag, if it is not 0 or 1, we will go with dynamic
# learning phase tensor.
_LEARNING_PHASE = -1
_UID_PREFIXES = defaultdict(int)

# cntk doesn't support gradient as symbolic op, to hook up with keras model,
# we will create gradient as a constant placeholder, here use this global
# map to keep the mapping from grad placeholder to parameter
grad_parameter_dict = {}

NAME_SCOPE_STACK = []


@contextmanager
def name_scope(name):
    global NAME_SCOPE_STACK
    NAME_SCOPE_STACK.append(name)
    yield
    NAME_SCOPE_STACK.pop()


def get_uid(prefix=''):
    _UID_PREFIXES[prefix] += 1
    return _UID_PREFIXES[prefix]


def learning_phase():
    # If _LEARNING_PHASE is not 0 or 1, return dynamic learning phase tensor
    if _LEARNING_PHASE in {0, 1}:
        return _LEARNING_PHASE
    else:
        return _LEARNING_PHASE_PLACEHOLDER


def set_learning_phase(value):
    global _LEARNING_PHASE
    if value not in {0, 1}:
        raise ValueError('CNTK Backend: Set learning phase '
                         'with value %s is not supported, '
                         'expected 0 or 1.' % value)
    _LEARNING_PHASE = value


def clear_session():
    """Reset learning phase flag for cntk backend.
    """
    global _LEARNING_PHASE
    global _LEARNING_PHASE_PLACEHOLDER
    _LEARNING_PHASE = -1
    _LEARNING_PHASE_PLACEHOLDER.value = np.asarray(1.0)


def in_train_phase(x, alt, training=None):
    global _LEARNING_PHASE
    if training is None:
        training = learning_phase()
        uses_learning_phase = True
    else:
        uses_learning_phase = False

    # CNTK currently don't support cond op, so here we use
    # element_select approach as workaround. It may have
    # perf issue, will resolve it later with cntk cond op.
    if callable(x) and isinstance(x, C.cntk_py.Function) is False:
        x = x()
    if callable(alt) and isinstance(alt, C.cntk_py.Function) is False:
        alt = alt()

    if training is True:
        x._uses_learning_phase = uses_learning_phase
        return x
    else:
        # if _LEARNING_PHASE is static
        if isinstance(training, int) or isinstance(training, bool):
            result = x if training == 1 or training is True else alt
        else:
            result = C.element_select(training, x, alt)
        result._uses_learning_phase = uses_learning_phase
        return result


def in_test_phase(x, alt, training=None):
    return in_train_phase(alt, x, training=training)


def _convert_string_dtype(dtype):
    if dtype == 'float32':
        return np.float32
    elif dtype == 'float64':
        return np.float64
    elif dtype == 'float16':
        return np.float16
    else:
        # cntk only running with float,
        # try to cast to float to run the model
        return np.float32


def _convert_dtype_string(dtype):
    if dtype == np.float32:
        return 'float32'
    elif dtype == np.float64:
        return 'float64'
    elif dtype == np.float16:
        return 'float16'
    else:
        raise ValueError('CNTK Backend: Unsupported dtype: %s. '
                         'CNTK only supports float32, float64, and '
                         'float16.' % dtype)


def variable(value, dtype=None, name=None, constraint=None):
    """Instantiates a variable and returns it.

    # Arguments
        value: Numpy array, initial value of the tensor.
        dtype: Tensor type.
        name: Optional name string for the tensor.
        constraint: Optional projection function to be
            applied to the variable after an optimizer update.

    # Returns
        A variable instance (with Keras metadata included).
    """
    if dtype is None:
        dtype = floatx()

    if name is None:
        name = ''

    if isinstance(
            value,
            C.variables.Constant) or isinstance(
            value,
            C.variables.Parameter):
        value = value.value

    # we don't support init parameter with symbolic op, so eval it first as
    # workaround
    if isinstance(value, C.cntk_py.Function):
        value = eval(value)

    shape = value.shape if hasattr(value, 'shape') else ()
    if hasattr(value, 'dtype') and value.dtype != dtype and len(shape) > 0:
        value = value.astype(dtype)

    # TODO: remove the conversion when cntk supports int32, int64
    # https://www.cntk.ai/pythondocs/cntk.variables.html#cntk.variables.Parameter
    dtype = 'float32' if 'int' in str(dtype) else dtype

    v = C.parameter(shape=shape,
                    init=value,
                    dtype=dtype,
                    name=_prepare_name(name, 'variable'))
    v._keras_shape = v.shape
    v._uses_learning_phase = False
    v.constraint = constraint
    return v


def is_variable(x):
    return isinstance(x, C.variables.Parameter)


def bias_add(x, bias, data_format=None):
    data_format = normalize_data_format(data_format)

    dims = len(x.shape)
    if dims > 0 and x.shape[0] == C.InferredDimension:
        dims -= 1

    bias_dims = len(bias.shape)
    if bias_dims != 1 and bias_dims != dims:
        raise ValueError('Unexpected bias dimensions %d, '
                         'expected 1 or %d dimensions' % (bias_dims, dims))

    if dims == 4:
        if data_format == 'channels_first':
            if bias_dims == 1:
                shape = (bias.shape[0], 1, 1, 1)
            else:
                shape = (bias.shape[3],) + bias.shape[:3]
        elif data_format == 'channels_last':
            if bias_dims == 1:
                shape = (1, 1, 1, bias.shape[0])
            else:
                shape = bias.shape
    elif dims == 3:
        if data_format == 'channels_first':
            if bias_dims == 1:
                shape = (bias.shape[0], 1, 1)
            else:
                shape = (bias.shape[2],) + bias.shape[:2]
        elif data_format == 'channels_last':
            if bias_dims == 1:
                shape = (1, 1, bias.shape[0])
            else:
                shape = bias.shape
    elif dims == 2:
        if data_format == 'channels_first':
            if bias_dims == 1:
                shape = (bias.shape[0], 1)
            else:
                shape = (bias.shape[1],) + bias.shape[:1]
        elif data_format == 'channels_last':
            if bias_dims == 1:
                shape = (1, bias.shape[0])
            else:
                shape = bias.shape
    else:
        shape = bias.shape
    return x + reshape(bias, shape)


def eval(x):
    if isinstance(x, C.cntk_py.Function):
        return x.eval()
    elif (isinstance(x, C.variables.Constant) or isinstance(
            x, C.variables.Parameter)):
        return x.value
    else:
        raise ValueError('CNTK Backend: `eval` method on '
                         '`%s` type is not supported. '
                         'CNTK only supports `eval` with '
                         '`Function`, `Constant` or '
                         '`Parameter`.' % type(x))


def placeholder(
        shape=None,
        ndim=None,
        dtype=None,
        sparse=False,
        name=None,
        dynamic_axis_num=1):
    if dtype is None:
        dtype = floatx()
    if not shape:
        if ndim:
            shape = tuple([None for _ in range(ndim)])

    if _get_cntk_version() >= 2.2:
        dynamic_dimension = C.FreeDimension
    else:
        dynamic_dimension = C.InferredDimension

    cntk_shape = [dynamic_dimension if s is None else s for s in shape]
    cntk_shape = tuple(cntk_shape)

    if dynamic_axis_num > len(cntk_shape):
        raise ValueError('CNTK backend: creating placeholder with '
                         '%d dimension is not supported, at least '
                         '%d dimensions are needed.'
                         % (len(cntk_shape), dynamic_axis_num))

    if name is None:
        name = ''

    cntk_shape = cntk_shape[dynamic_axis_num:]

    x = C.input(
        shape=cntk_shape,
        dtype=_convert_string_dtype(dtype),
        is_sparse=sparse,
        name=name)
    x._keras_shape = shape
    x._uses_learning_phase = False
    x._cntk_placeholder = True
    return x


def is_placeholder(x):
    """Returns whether `x` is a placeholder.

    # Arguments
        x: A candidate placeholder.

    # Returns
        Boolean.
    """
    return hasattr(x, '_cntk_placeholder') and x._cntk_placeholder


def is_keras_tensor(x):
    if not is_tensor(x):
        raise ValueError('Unexpectedly found an instance of type `' +
                         str(type(x)) + '`. '
                         'Expected a symbolic tensor instance.')
    return hasattr(x, '_keras_history')


def is_tensor(x):
    return isinstance(x, (C.variables.Constant,
                          C.variables.Variable,
                          C.variables.Parameter,
                          C.ops.functions.Function))


def shape(x):
    shape = list(int_shape(x))
    num_dynamic = _get_dynamic_axis_num(x)
    non_dyn_shape = []
    for i in range(len(x.shape)):
        if shape[i + num_dynamic] is None:
            non_dyn_shape.append(x.shape[i])
        else:
            non_dyn_shape.append(shape[i + num_dynamic])
    return shape[:num_dynamic] + non_dyn_shape


def is_sparse(tensor):
    return tensor.is_sparse


def int_shape(x):
    if hasattr(x, '_keras_shape'):
        return x._keras_shape

    if hasattr(x, 'shape'):
        shape = x.shape
    else:
        shape = np.array(x).shape

    if hasattr(x, 'dynamic_axes'):
        dynamic_shape = [None for a in x.dynamic_axes]
        shape = tuple(dynamic_shape) + shape
    return shape


def ndim(x):
    shape = int_shape(x)
    return len(shape)


def _prepare_name(name, default):
    prefix = '_'.join(NAME_SCOPE_STACK)
    if name is None or name == '':
        return prefix + '/' + default
    return prefix + '/' + name


def constant(value, dtype=None, shape=None, name=None):
    if dtype is None:
        dtype = floatx()
    if shape is None:
        shape = ()
    np_value = value * np.ones(shape)
    const = C.constant(np_value,
                       dtype=dtype,
                       name=_prepare_name(name, 'constant'))
    const._keras_shape = const.shape
    const._uses_learning_phase = False
    return const


def random_binomial(shape, p=0.0, dtype=None, seed=None):
    if seed is None:
        # ensure that randomness is conditioned by the Numpy RNG
        seed = np.random.randint(10e7)
    if dtype is None:
        dtype = floatx()
    else:
        dtype = _convert_string_dtype(dtype)

    for _ in shape:
        if _ is None:
            raise ValueError('CNTK Backend: randomness op with '
                             'dynamic shape is not supported now. '
                             'Please provide fixed dimension '
                             'instead of `None`.')
    return C.random.bernoulli(shape=shape, dtype=dtype, mean=p, seed=seed)


def random_uniform(shape, minval=0.0, maxval=1.0, dtype=None, seed=None):
    for _ in shape:
        if _ is None:
            raise ValueError('CNTK Backend: randomness op with '
                             'dynamic shape is not supported now. '
                             'Please provide fixed dimension '
                             'instead of `None`.')

    if seed is None:
        # ensure that randomness is conditioned by the Numpy RNG
        seed = np.random.randint(10e3)
    return C.random.uniform(
        shape=shape,
        dtype=dtype,
        low=minval,
        high=maxval,
        seed=seed)


def random_uniform_variable(shape, low, high,
                            dtype=None, name=None, seed=None):
    if seed is None:
        # ensure that randomness is conditioned by the Numpy RNG
        seed = np.random.randint(10e3)

    if dtype is None:
        dtype = floatx()
    else:
        dtype = _convert_string_dtype(dtype)

    if name is None:
        name = ''

    scale = (high - low) / 2
    p = C.parameter(
        shape,
        init=C.initializer.uniform(
            scale,
            seed=seed),
        dtype=dtype,
        name=name)
    return variable(value=p.value + low + scale)


def random_normal_variable(
        shape,
        mean,
        scale,
        dtype=None,
        name=None,
        seed=None):
    if seed is None:
        # ensure that randomness is conditioned by the Numpy RNG
        seed = np.random.randint(10e7)
    if dtype is None:
        dtype = floatx()
    else:
        dtype = _convert_string_dtype(dtype)

    if name is None:
        name = ''

    p = C.parameter(
        shape=shape,
        init=C.initializer.normal(
            scale=scale,
            seed=seed),
        dtype=dtype,
        name=name)
    return variable(value=p.value + mean)


def random_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None):
    if dtype is None:
        dtype = floatx()
    for _ in shape:
        if _ is None:
            raise ValueError('CNTK Backend: randomness op with '
                             'dynamic shape is not supported now. '
                             'Please provide fixed dimension '
                             'instead of `None`.')
    if seed is None:
        # ensure that randomness is conditioned by the Numpy RNG
        seed = np.random.randint(10e3)
    return C.random.normal(
        shape=shape, mean=mean,
        scale=stddev, seed=seed,
        dtype=dtype)


def truncated_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None):
    if seed is None:
        seed = np.random.randint(1, 10e6)
    if dtype is None:
        dtype = floatx()
    else:
        dtype = _convert_string_dtype(dtype)

    return C.parameter(
        shape, init=C.initializer.truncated_normal(
            stddev, seed=seed), dtype=dtype)


def dtype(x):
    return _convert_dtype_string(x.dtype)


def zeros(shape, dtype=None, name=None):
    if dtype is None:
        dtype = floatx()
    ctype = _convert_string_dtype(dtype)
    return variable(value=np.zeros(shape, ctype), dtype=dtype, name=name)


def ones(shape, dtype=None, name=None):
    if dtype is None:
        dtype = floatx()
    ctype = _convert_string_dtype(dtype)
    return variable(value=np.ones(shape, ctype), dtype=dtype, name=name)


def eye(size, dtype=None, name=None):
    if dtype is None:
        dtype = floatx()
    if isinstance(size, (list, tuple)):
        n, m = size
    else:
        n, m = size, size
    return variable(np.eye(n, m), dtype, name)


def zeros_like(x, dtype=None, name=None):
    name = name or ''
    if dtype is None:
        dtype = floatx()
    return C.cast(C.zeros_like(x, name), dtype)


def ones_like(x, dtype=None, name=None):
    name = name or ''
    if dtype is None:
        dtype = floatx()
    return C.cast(C.ones_like(x, name), dtype)


def count_params(x):
    for _ in x.shape:
        if _ == C.InferredDimension or _ == C.FreeDimension:
            raise ValueError('CNTK backend: `count_params` with dynamic '
                             'shape is not supported. Please provide '
                             'fixed dimension instead of `None`.')

    return np.prod(int_shape(x))


def cast(x, dtype):
    # cntk calculate everything in float, so don't need case from bool / int
    return x


def size(x, name=None):
    return sum(ones_like(x, name=name))


def dot(x, y):
    if len(x.shape) > 2 or len(y.shape) > 2:
        y_shape = int_shape(y)
        if len(y_shape) > 2:
            permutation = [len(y_shape) - 2]
            permutation += list(range(len(y_shape) - 2))
            permutation += [len(y_shape) - 1]
            y = C.transpose(y, perm=permutation)
        return C.times(x, y, len(y_shape) - 1)
    else:
        return C.times(x, y)


def batch_dot(x, y, axes=None):
    x_shape = int_shape(x)
    y_shape = int_shape(y)

    x_ndim = len(x_shape)
    y_ndim = len(y_shape)

    if x_ndim < 2 or y_ndim < 2:
        raise ValueError('Can not do batch_dot on inputs '
                         'with rank < 2. '
                         'Received inputs with shapes ' +
                         str(x_shape) + ' and ' +
                         str(y_shape) + '.')

    x_batch_size = x_shape[0]
    y_batch_size = y_shape[0]

    if x_batch_size is not None and y_batch_size is not None:
        if x_batch_size != y_batch_size:
            raise ValueError('Can not do batch_dot on inputs '
                             'with different batch sizes. '
                             'Received inputs with shapes ' +
                             str(x_shape) + ' and ' +
                             str(y_shape) + '.')

    if isinstance(axes, int):
        axes = [axes, axes]

    if axes is None:
        if y_ndim == 2:
            axes = [x_ndim - 1, y_ndim - 1]
        else:
            axes = [x_ndim - 1, y_ndim - 2]

    if b_any([isinstance(a, (list, tuple)) for a in axes]):
        raise ValueError('Multiple target dimensions are not supported. ' +
                         'Expected: None, int, (int, int), ' +
                         'Provided: ' + str(axes))

    # if tuple, convert to list
    axes = list(axes)

    # convert negative indices
    if axes[0] < 0:
        axes[0] += x_ndim
    if axes[1] < 0:
        axes[1] += y_ndim

    if 0 in axes:
        raise ValueError('Can not perform batch_dot over axis 0.'
                         ' If your inputs are not batched,'
                         ' add a dummy batch dimension to your '
                         'inputs using K.expand_dims(x, 0)')
    d1 = x_shape[axes[0]]
    d2 = y_shape[axes[1]]

    if d1 is not None and d2 is not None and d1 != d2:
        raise ValueError('Can not do batch_dot on inputs with shapes ' +
                         str(x_shape) + ' and ' + str(y_shape) +
                         ' with axes=' + str(axes) + '. x.shape[%d] != '
                         'y.shape[%d] (%d != %d).' % (axes[0], axes[1], d1, d2))

    # Input shapes:
    # x: (b_size, x1, ..., d, ..., xn)
    # y: (b_size, y1, ..., d, ..., yn)
    # where d is the dimension to reduce.

    # Bring d to the last dimension in x
    # x: (b_size, ..., d)

    permute_pattern = list(range(x_ndim))
    for i in range(axes[0], x_ndim - 1):
        permute_pattern[i] = permute_pattern[i + 1]
    permute_pattern[-1] = axes[0]

    x = permute_dimensions(x, permute_pattern)

    # Bring d to the second dimension in y
    # y: (b_size, d, ...)
    permute_pattern = list(range(y_ndim))

    for i in range(axes[1], 1, -1):
        permute_pattern[i] = permute_pattern[i - 1]
    permute_pattern[1] = axes[1]
    y = permute_dimensions(y, permute_pattern)

    # Expand to rank 3 if needed
    if x_ndim == 2:
        x = expand_dims(x, 1)
        x_expanded = True
    else:
        x_expanded = False

    if y_ndim == 2:
        y = expand_dims(y, -1)
        y_expanded = True
    else:
        y_expanded = False

    x_shape = int_shape(x)
    y_shape = int_shape(y)

    # batch size might be lost at this point
    x_batch_size = x_shape[0]
    y_batch_size = y_shape[0]

    if x_batch_size is None and y_batch_size is None:
        dynamic_batch_size = True
    elif x_batch_size is not None and y_batch_size is not None:
        dynamic_batch_size = False
    else:
        raise ValueError('Can not perform batch_dot on inputs' +
                         ' with both static and dynamic batch sizes.' +
                         'You probably attempted to permform the ' +
                         'operation on a placeholder and a variable, ' +
                         'which is not yet supported on the CNTK backend.')

    if dynamic_batch_size:
        result = C.times(x, y, output_rank=y_ndim - 2 + int(y_expanded))
    else:
        result = []

        for i in range(x_batch_size):
            xi = x[i]
            yi = y[i]
            if ndim(xi) == ndim(x):  # for older versions of CNTK
                xi = squeeze(xi, 0)
                yi = squeeze(yi, 0)
            result.append(C.times(xi, yi, output_rank=y_ndim - 2 + int(y_expanded)))
        result = stack(result, 0)

    if x_expanded:
        result = squeeze(result, 1)

    if y_expanded:
        result = squeeze(result, -1)

    if ndim(result) == 1:
        return expand_dims(result)
    return result


def transpose(x):
    return C.swapaxes(x, 0, 1)


def gather(reference, indices):
    # There is a bug in cntk gather op which may cause crash.
    # We have made a fix but not catched in CNTK 2.1 release.
    # Will update with gather op in next release
    if _get_cntk_version() >= 2.2:
        return C.ops.gather(reference, indices)
    else:
        num_classes = reference.shape[0]
        one_hot_matrix = C.ops.one_hot(indices, num_classes)
        return C.times(
            one_hot_matrix, reference,
            output_rank=len(reference.shape) - 1)


def _remove_dims(x, axis, keepdims=False):
    if keepdims is False and isinstance(axis, list):
        # sequence axis is removed by default, so don't need reshape on it
        reduce_axes = []
        for a in axis:
            if isinstance(a, C.Axis) is False:
                reduce_axes.append(a)
        return _reshape_dummy_dim(x, reduce_axes)
    else:
        if isinstance(axis, list):
            has_seq = False
            for a in axis:
                if isinstance(a, C.Axis):
                    has_seq = True
                    break
            if has_seq:
                nones = _get_dynamic_axis_num(x)
                x = expand_dims(x, nones)
        return x


def max(x, axis=None, keepdims=False):
    axis = _normalize_axis(axis, x)
    output = _reduce_on_axis(x, axis, 'reduce_max')

    return _remove_dims(output, axis, keepdims)


def min(x, axis=None, keepdims=False):
    axis = _normalize_axis(axis, x)
    output = _reduce_on_axis(x, axis, 'reduce_min')

    return _remove_dims(output, axis, keepdims)


def sum(x, axis=None, keepdims=False):
    axis = _normalize_axis(axis, x)
    output = _reduce_on_axis(x, axis, 'reduce_sum')

    return _remove_dims(output, axis, keepdims)


def prod(x, axis=None, keepdims=False):
    axis = _normalize_axis(axis, x)
    output = _reduce_on_axis(x, axis, 'reduce_prod')

    return _remove_dims(output, axis, keepdims)


def logsumexp(x, axis=None, keepdims=False):
    return log(sum(exp(x), axis=axis, keepdims=keepdims))


def var(x, axis=None, keepdims=False):
    m = mean(x, axis, keepdims=True)
    devs_squared = C.square(x - m)
    return mean(devs_squared, axis=axis, keepdims=keepdims)


def std(x, axis=None, keepdims=False):
    return C.sqrt(var(x, axis=axis, keepdims=keepdims))


def expand_dims(x, axis=-1):
    shape = list(int_shape(x))
    nones = _get_dynamic_axis_num(x)
    index = axis if axis >= 0 else len(shape) + 1
    shape.insert(index, 1)
    new_shape = shape[nones:]
    new_shape = tuple(
        [C.InferredDimension if _ is None else _ for _ in new_shape])
    result = C.reshape(x, new_shape)
    if index < nones:
        result._keras_shape = shape
    return result


def squeeze(x, axis):
    if isinstance(axis, tuple):
        axis = list(axis)
    if not isinstance(axis, list):
        axis = [axis]

    shape = list(int_shape(x))

    _axis = []
    for _ in axis:
        if isinstance(_, int):
            _axis.append(_ if _ >= 0 else _ + len(shape))

    if len(_axis) == 0:
        return x

    nones = _get_dynamic_axis_num(x)
    for _ in sorted(_axis, reverse=True):
        del shape[_]

    new_shape = shape[nones:]

    new_shape_temp = []
    for _ in new_shape:
        if _ == C.FreeDimension:
            new_shape_temp.append(C.InferredDimension)
        else:
            new_shape_temp.append(_)

    new_shape = tuple(new_shape_temp)

    return C.reshape(x, new_shape)


def tile(x, n):
    if isinstance(n, int):
        n = (n,)
    elif isinstance(n, list):
        n = tuple(n)

    shape = int_shape(x)
    num_dynamic_axis = _get_dynamic_axis_num(x)
    if len(n) < len(shape):  # Padding the axis
        n = tuple([1 for _ in range(len(shape) - len(n))]) + n
    elif len(n) != len(shape):
        raise NotImplementedError

    i = num_dynamic_axis
    for i, rep in enumerate(n):
        if i >= num_dynamic_axis and shape[i] is not None:
            tmp = [x] * rep
            x = C.splice(*tmp, axis=i - num_dynamic_axis)
        i += 1

    return x


def _normalize_axis(axis, x):
    shape = int_shape(x)
    ndim = len(shape)

    nones = _get_dynamic_axis_num(x)

    if nones > ndim:
        raise ValueError(
            'CNTK Backend: tensor with keras shape: `%s` has '
            '%d cntk dynamic axis, this is not expected, please '
            'double check the keras shape history.'
            % (str(shape), nones))

    # Current cntk does not support shape like (1, batch). so using the workaround
    # here to mapping the correct axis. Will remove this tricky after we add support
    # in native cntk op
    cntk_axis = []
    dynamic_axis_index = 0
    for i in range(ndim):
        if shape[i] is None and dynamic_axis_index < nones:
            cntk_axis.append(x.dynamic_axes[dynamic_axis_index])
            dynamic_axis_index += 1
        else:
            cntk_axis.append(i - dynamic_axis_index)

    if dynamic_axis_index < nones:
        i = 0
        while dynamic_axis_index < nones:
            cntk_axis[i] = x.dynamic_axes[dynamic_axis_index]
            i += 1
            dynamic_axis_index += 1

        while i < len(cntk_axis):
            cntk_axis[i] -= nones
            i += 1

    if isinstance(axis, tuple):
        _axis = list(axis)
    elif isinstance(axis, int):
        _axis = [axis]
    elif isinstance(axis, list):
        _axis = list(axis)
    else:
        _axis = axis

    if isinstance(_axis, list):
        for i, a in enumerate(_axis):
            if a is not None and a < 0:
                _axis[i] = (a % ndim)
            if _axis[i] is not None:
                _axis[i] = cntk_axis[_axis[i]]
    else:
        if _axis is None:
            _axis = C.Axis.all_axes()

    return _axis


def _reshape_dummy_dim(x, axis):
    shape = list(x.shape)

    _axis = [_ + len(shape) if _ < 0 else _ for _ in axis]

    if shape.count(C.InferredDimension) > 1 or shape.count(C.FreeDimension) > 1:
        result = x
        for index in sorted(_axis, reverse=True):
            result = C.reshape(result,
                               shape=(),
                               begin_axis=index,
                               end_axis=index + 1)
        return result
    else:
        for index in sorted(_axis, reverse=True):
            del shape[index]

        shape = [C.InferredDimension if _ == C.FreeDimension else _ for _ in shape]
        return C.reshape(x, shape)


def mean(x, axis=None, keepdims=False):
    axis = _normalize_axis(axis, x)
    output = _reduce_on_axis(x, axis, 'reduce_mean')

    return _remove_dims(output, axis, keepdims)


def any(x, axis=None, keepdims=False):
    reduce_result = sum(x, axis, keepdims=keepdims)
    any_matrix = C.element_select(
        reduce_result,
        ones_like(reduce_result),
        zeros_like(reduce_result))
    if len(reduce_result.shape) == 0 and _get_dynamic_axis_num(x) == 0:
        return C.reduce_sum(any_matrix)
    else:
        return any_matrix


def all(x, axis=None, keepdims=False):
    reduce_result = prod(x, axis, keepdims=keepdims)
    all_matrix = C.element_select(
        reduce_result,
        ones_like(reduce_result),
        zeros_like(reduce_result))
    if len(reduce_result.shape) == 0 and _get_dynamic_axis_num(x) == 0:
        return C.reduce_sum(all_matrix)
    else:
        return all_matrix


def classification_error(target, output, axis=-1):
    return C.ops.reduce_mean(
        C.equal(
            argmax(
                output,
                axis=-1),
            argmax(
                target,
                axis=-1)),
        axis=C.Axis.all_axes())


def argmax(x, axis=-1):
    axis = [axis]
    axis = _normalize_axis(axis, x)
    output = C.ops.argmax(x, axis=axis[0])
    return _reshape_dummy_dim(output, axis)


def argmin(x, axis=-1):
    axis = [axis]
    axis = _normalize_axis(axis, x)
    output = C.ops.argmin(x, axis=axis[0])
    return _reshape_dummy_dim(output, axis)


def square(x):
    return C.square(x)


def abs(x):
    return C.abs(x)


def sqrt(x):
    return C.sqrt(x)


def exp(x):
    return C.exp(x)


def log(x):
    return C.log(x)


def round(x):
    return C.round(x)


def sigmoid(x):
    return C.sigmoid(x)


def sign(x):
    return x / C.abs(x)


def pow(x, a):
    return C.pow(x, a)


def clip(x, min_value, max_value):
    if (isinstance(min_value, (int, float)) and
            isinstance(max_value, (int, float))):
        if max_value < min_value:
            max_value = min_value
    if min_value is None:
        min_value = -np.inf
    if max_value is None:
        max_value = np.inf
    return C.clip(x, min_value, max_value)


def binary_crossentropy(target, output, from_logits=False):
    if from_logits:
        output = C.sigmoid(output)
    output = C.clip(output, epsilon(), 1.0 - epsilon())
    output = -target * C.log(output) - (1.0 - target) * C.log(1.0 - output)
    return output


def get_variable_shape(x):
    return int_shape(x)


def update(x, new_x):
    return C.assign(x, new_x)


def moving_average_update(variable, value, momentum):
    return C.assign(variable, variable * momentum + value * (1. - momentum))


def update_add(x, increment):
    result = x + increment
    return C.assign(x, result)


def update_sub(x, decrement):
    result = x - decrement
    return C.assign(x, result)


def gradients(loss, variables):
    # cntk does not support gradients as symbolic op,
    # to hook up with keras model
    # we will return a constant as place holder, the cntk learner will apply
    # the gradient during training.
    global grad_parameter_dict
    if isinstance(variables, list) is False:
        variables = [variables]
    grads = []
    for v in variables:
        g = C.constant(0, shape=v.shape, name='keras_grad_placeholder')
        grads.append(g)
        grad_parameter_dict[g] = v
    return grads


def equal(x, y):
    return C.equal(x, y)


def not_equal(x, y):
    return C.not_equal(x, y)


def greater(x, y):
    return C.greater(x, y)


def greater_equal(x, y):
    return C.greater_equal(x, y)


def less(x, y):
    return C.less(x, y)


def less_equal(x, y):
    return C.less_equal(x, y)


def maximum(x, y):
    return C.element_max(x, y)


def minimum(x, y):
    return C.element_min(x, y)


def sin(x):
    return C.sin(x)


def cos(x):
    return C.cos(x)


def normalize_batch_in_training(x, gamma, beta,
                                reduction_axes, epsilon=1e-3):
    if gamma is None:
        if beta is None:
            gamma = ones_like(x)
        else:
            gamma = ones_like(beta)
    if beta is None:
        if gamma is None:
            beta = zeros_like(x)
        else:
            beta = zeros_like(gamma)

    mean, variant = _moments(x, _normalize_axis(reduction_axes, x))

    if sorted(reduction_axes) == list(range(ndim(x)))[:-1]:
        normalized = batch_normalization(
            x, mean, variant, beta, gamma, epsilon)
    else:
        # need broadcasting
        target_shape = []
        x_shape = int_shape(x)
        # skip the batch axis
        for axis in range(1, ndim(x)):
            if axis in reduction_axes:
                target_shape.append(1)
                if ndim(gamma) > axis:
                    gamma = C.reduce_mean(gamma, axis - 1)
                    beta = C.reduce_mean(beta, axis - 1)
            else:
                target_shape.append(x_shape[axis])

        broadcast_mean = C.reshape(mean, target_shape)
        broadcast_var = C.reshape(variant, target_shape)
        broadcast_gamma = C.reshape(gamma, target_shape)
        broadcast_beta = C.reshape(beta, target_shape)
        normalized = batch_normalization(
            x,
            broadcast_mean,
            broadcast_var,
            broadcast_beta,
            broadcast_gamma,
            epsilon)

    return normalized, mean, variant


def _moments(x, axes=None, shift=None, keep_dims=False):
    _axes = tuple(axes)
    if shift is None:
        shift = x
        # Compute true mean while keeping the dims for proper broadcasting.
        for axis in _axes:
            shift = C.reduce_mean(shift, axis=axis)

    shift = C.stop_gradient(shift)
    shifted_mean = C.minus(x, shift)
    for axis in _axes:
        shifted_mean = C.reduce_mean(shifted_mean, axis=axis)

    variance_mean = C.square(C.minus(x, shift))
    for axis in _axes:
        variance_mean = C.reduce_mean(variance_mean, axis=axis)

    variance = C.minus(variance_mean, C.square(shifted_mean))
    mean = C.plus(shifted_mean, shift)

    if not keep_dims:
        mean = squeeze(mean, _axes)
        variance = squeeze(variance, _axes)

    return mean, variance


def batch_normalization(x, mean, var, beta, gamma, axis=-1, epsilon=1e-3):
    # The mean / var / beta / gamma may be processed by broadcast
    # so it may have an extra batch axis with 1, it is not needed
    # in cntk, need to remove those dummy axis.
    if ndim(mean) == ndim(x) and shape(mean)[0] == 1:
        mean = _reshape_dummy_dim(mean, [0])
    if ndim(var) == ndim(x) and shape(var)[0] == 1:
        var = _reshape_dummy_dim(var, [0])

    if gamma is None:
        gamma = ones_like(var)
    elif ndim(gamma) == ndim(x) and shape(gamma)[0] == 1:
        gamma = _reshape_dummy_dim(gamma, [0])

    if beta is None:
        beta = zeros_like(mean)
    elif ndim(beta) == ndim(x) and shape(beta)[0] == 1:
        beta = _reshape_dummy_dim(beta, [0])

    return (x - mean) / C.sqrt(var + epsilon) * gamma + beta


def concatenate(tensors, axis=-1):
    if len(tensors) == 0:
        return None

    axis = [axis]
    axis = _normalize_axis(axis, tensors[0])
    return C.splice(*tensors, axis=axis[0])


def stack(x, axis=0):
    x = [expand_dims(t, axis) for t in x]
    return concatenate(x, axis)


def flatten(x):
    return reshape(x, (-1,))


def reshape(x, shape):
    shape_temp = []
    for _ in shape:
        if _ == C.FreeDimension:
            shape_temp.append(C.InferredDimension)
        else:
            shape_temp.append(_)

    shape = tuple(shape_temp)

    if isinstance(x, C.variables.Parameter):
        return C.reshape(x, shape)
    else:
        num_dynamic_axis = _get_dynamic_axis_num(x)

        if num_dynamic_axis == 1 and len(shape) > 0 and shape[0] == -1:
            # collapse axis with batch axis
            if b_any(_ == C.InferredDimension for _ in x.shape) or b_any(
                    _ == C.FreeDimension for _ in x.shape):
                warnings.warn(
                    'Warning: CNTK backend does not support '
                    'collapse of batch axis with inferred dimension. '
                    'The reshape did not take place.')
                return x
            return _reshape_batch(x, shape)
        else:
            # no collapse, then first need to padding the shape
            if num_dynamic_axis >= len(shape):
                i = 0
                while i < len(shape):
                    if shape[i] is None or shape[i] == -1:
                        i += 1
                    else:
                        break
                shape = tuple([-1 for _ in range(num_dynamic_axis - i)]) + shape

            new_shape = list(shape)
            new_shape = new_shape[num_dynamic_axis:]
            new_shape = [C.InferredDimension if _ is None else _ for _ in new_shape]
            return C.reshape(x, new_shape)


def permute_dimensions(x, pattern):
    dims = len(int_shape(x))
    num_dynamic_axis = _get_dynamic_axis_num(x)
    if isinstance(pattern, list):
        current_layout = [i for i in range(dims)]
    else:
        current_layout = tuple([i for i in range(dims)])

    if (num_dynamic_axis > 0 and
            pattern[:num_dynamic_axis] != current_layout[:num_dynamic_axis]):
                raise ValueError('CNTK backend: the permute pattern %s '
                                 'requested permute on dynamic axis, '
                                 'which is not supported. Please do permute '
                                 'on static axis.' % pattern)

    axis = list(pattern)
    axis = axis[num_dynamic_axis:]
    axis = _normalize_axis(axis, x)
    return C.transpose(x, axis)


def resize_images(x, height_factor, width_factor, data_format,
                  interpolation='nearest'):
    if interpolation == 'nearest':
        if data_format == 'channels_first':
            output = repeat_elements(x, height_factor, axis=2)
            output = repeat_elements(output, width_factor, axis=3)
            return output
        elif data_format == 'channels_last':
            output = repeat_elements(x, height_factor, axis=1)
            output = repeat_elements(output, width_factor, axis=2)
            return output
        else:
            raise ValueError('CNTK Backend: Invalid data_format: %s' % data_format)
    else:
        raise NotImplementedError('CNTK only supports `nearest` interpolation.')


def resize_volumes(x, depth_factor, height_factor, width_factor, data_format):
    if data_format == 'channels_first':
        output = repeat_elements(x, depth_factor, axis=2)
        output = repeat_elements(output, height_factor, axis=3)
        output = repeat_elements(output, width_factor, axis=4)
        return output
    elif data_format == 'channels_last':
        output = repeat_elements(x, depth_factor, axis=1)
        output = repeat_elements(output, height_factor, axis=2)
        output = repeat_elements(output, width_factor, axis=3)
        return output
    else:
        raise ValueError('CNTK Backend: Invalid data_format: %s' % data_format)


def repeat_elements(x, rep, axis):
    axis = _normalize_axis(axis, x)
    axis = axis[0]
    slices = []
    shape = x.shape
    i = 0
    while i < shape[axis]:
        tmp = C.ops.slice(x, axis, i, i + 1)
        for _ in range(rep):
            slices.append(tmp)
        i += 1
    return C.splice(*slices, axis=axis)


def repeat(x, n):
    # this is a workaround for recurrent layer
    # if n is inferred dimension,
    # we can't figure out how to repeat it in cntk now
    # return the same x to take cntk broadcast feature
    # to make the recurrent layer work.
    # need to be fixed in GA.
    if n is C.InferredDimension or n is C.FreeDimension:
        return x
    index = 1 - _get_dynamic_axis_num(x)
    if index < 0 or index > 1:
        raise NotImplementedError

    new_shape = list(x.shape)
    new_shape.insert(index, 1)
    new_shape = tuple(new_shape)
    x = C.reshape(x, new_shape)
    temp = [x] * n
    return C.splice(*temp, axis=index)


def tanh(x):
    return C.tanh(x)


def _static_rnn(step_function, inputs, initial_states,
                go_backwards=False, mask=None, constants=None,
                unroll=False, input_length=None):

    shape = int_shape(inputs)
    dims = len(shape)

    uses_learning_phase = False

    if dims < 3:
        raise ValueError('Input should be at least 3D.')

    # if the second axis is static axis, CNTK will do unroll by default
    if shape[1] is None:
        raise ValueError('CNTK Backend: the input of static rnn '
                         'has shape `%s`, the second axis '
                         'is not static. If you want to run '
                         'rnn with non-static axis, please try '
                         'dynamic rnn with sequence axis.' % shape)
    if constants is None:
        constants = []

    if mask is not None:
        mask_shape = int_shape(mask)
        if len(mask_shape) == dims - 1:
            mask = expand_dims(mask)

    nones = _get_dynamic_axis_num(inputs)

    states = tuple(initial_states)

    outputs = []

    time_axis = 1 - nones if nones > 0 else 1

    if go_backwards:
        i = shape[1] - 1
        while i >= 0:
            current = C.ops.slice(inputs, time_axis, i, i + 1)
            # remove dummy dimension
            current = squeeze(current, time_axis)

            output, new_states = step_function(
                current, tuple(states) + tuple(constants))
            if getattr(output, '_uses_learning_phase', False):
                uses_learning_phase = True

            if mask is not None:
                mask_slice = C.ops.slice(mask, time_axis, i, i + 1)
                mask_slice = squeeze(mask_slice, time_axis)
                if len(outputs) == 0:
                    prev_output = zeros_like(output)
                else:
                    prev_output = outputs[-1]
                output = C.ops.element_select(mask_slice, output, prev_output)

                return_states = []
                for s, n_s in zip(states, new_states):
                    return_states.append(
                        C.ops.element_select(
                            mask_slice, n_s, s))
                new_states = return_states
            outputs.append(output)
            states = new_states
            i -= 1
    else:
        i = 0
        while i < shape[1]:
            current = C.ops.slice(inputs, time_axis, i, i + 1)
            # remove dummy dimension
            current = squeeze(current, 1)

            output, new_states = step_function(
                current, tuple(states) + tuple(constants))
            if getattr(output, '_uses_learning_phase', False):
                uses_learning_phase = True

            if mask is not None:
                mask_slice = C.ops.slice(mask, time_axis, i, i + 1)
                mask_slice = squeeze(mask_slice, 1)
                if len(outputs) == 0:
                    prev_output = zeros_like(output)
                else:
                    prev_output = outputs[-1]
                output = C.ops.element_select(mask_slice, output, prev_output)

                return_states = []
                for s, n_s in zip(states, new_states):
                    return_states.append(
                        C.ops.element_select(
                            mask_slice, n_s, s))
                new_states = return_states
            outputs.append(output)
            states = new_states[:len(states)]
            i += 1

    i = 1
    # add the time_step axis back
    final_output = expand_dims(outputs[0], 1)
    last_output = outputs[0]
    while i < len(outputs):
        # add the time_step axis back
        output_slice = expand_dims(outputs[i], 1)
        final_output = C.splice(final_output, output_slice, axis=time_axis)
        last_output = outputs[i]
        i += 1

    last_output._uses_learning_phase = uses_learning_phase
    return last_output, final_output, states


def rnn(step_function, inputs, initial_states,
        go_backwards=False, mask=None, constants=None,
        unroll=False, input_length=None):

    if not unroll and mask is not None:
        warnings.warn(
            'CNTK Backend only supports accurate masking if '
            '`output == new_states[0]` for '
            '`output, new_states = step_function(inputs, states)`')

    shape = int_shape(inputs)
    dims = len(shape)

    global uses_learning_phase
    uses_learning_phase = False

    if dims < 3:
        raise ValueError('CNTK Backend: the input of rnn has only rank %d '
                         'Need at least rank 3 to run RNN.' % dims)

    if _get_dynamic_axis_num(inputs) == 0 or unroll:
        return _static_rnn(
            step_function,
            inputs,
            initial_states,
            go_backwards,
            mask,
            constants,
            unroll,
            input_length)

    if constants is None:
        constants = []

    num_time_step = shape[1]
    if num_time_step is None and not has_seq_axis(inputs):
        num_time_step = inputs.shape[0]

    initial = []
    for s in initial_states:
        if _get_dynamic_axis_num(s) == 0:
            if hasattr(C, 'to_batch'):
                initial.append(C.to_batch(s))
            else:
                initial.append(C.user_function(ConvertToBatch(s)))
        else:
            initial.append(s)

    need_convert = not has_seq_axis(inputs)
    if go_backwards and need_convert is False:
        raise NotImplementedError(
            'CNTK Backend: `go_backwards` is not supported with '
            'variable-length sequences. Please specify a '
            'static length for your sequences.')

    rnn_inputs = inputs
    if need_convert:
        if go_backwards:
            rnn_inputs = reverse(rnn_inputs, 1)

        rnn_inputs = C.to_sequence(rnn_inputs)

        rnn_constants = []
        for constant in constants:
            if isinstance(constant, list):
                new_c = []
                for c in constant:
                    if _get_dynamic_axis_num(c) == 1:
                        new_c.append(C.sequence.broadcast_as(c, rnn_inputs))
                    else:
                        new_c.append(c)
                rnn_constants.append(new_c)
            else:
                if _get_dynamic_axis_num(constant) == 1:
                    rnn_constants.append(C.sequence.broadcast_as(
                        constant,
                        rnn_inputs))
                else:
                    rnn_constants.append(constant)
    else:
        rnn_constants = constants

    if mask is not None and not has_seq_axis(mask):
        if go_backwards:
            mask = reverse(mask, 1)
        if len(int_shape(mask)) == 2:
            mask = expand_dims(mask)
        mask = C.to_sequence_like(mask, rnn_inputs)

    states = tuple(initial)

    with C.default_options(axis_offset=1):
        def _recurrence(x, states, m):
            # create place holder
            place_holders = [C.placeholder(
                dynamic_axes=x.dynamic_axes) for _ in states]
            past_values = []
            for s, p in zip(states, place_holders):
                past_values.append(C.sequence.past_value(p, s))
            new_output, new_states = step_function(
                x, tuple(past_values) + tuple(rnn_constants))

            if getattr(new_output, '_uses_learning_phase', False):
                global uses_learning_phase
                uses_learning_phase = True

            if m is not None:
                new_states_temp = []
                for n, s in zip(new_states, past_values):
                    new_states_temp.append(C.element_select(m, n, s))

                new_states = new_states_temp

            n_s = []
            for o, p in zip(new_states, place_holders):
                n_s.append(o.replace_placeholders({p: o.output}))
            if len(n_s) > 0:
                new_output = n_s[-1]
            return new_output, n_s

        final_output, final_states = _recurrence(rnn_inputs, states, mask)
        last_output = C.sequence.last(final_output)
        last_states = [C.sequence.last(s) for s in final_states]

    if need_convert:
        final_output = C.sequence.unpack(final_output, 0, no_mask_output=True)
        if num_time_step is not None and num_time_step is not C.FreeDimension:
            final_output = _reshape_sequence(final_output, num_time_step)

    f_stats = []
    for l_s, i_s in zip(last_states, initial_states):
        if _get_dynamic_axis_num(i_s) == 0 and _get_dynamic_axis_num(l_s) == 1:
            if hasattr(C, 'unpack_batch'):
                f_stats.append(C.unpack_batch(l_s))
            else:
                f_stats.append(
                    C.user_function(ConvertToStatic(l_s, batch_size=i_s.shape[0])))
        else:
            f_stats.append(l_s)

    last_output._uses_learning_phase = uses_learning_phase
    return last_output, final_output, f_stats


def has_seq_axis(x):
    return hasattr(x, 'dynamic_axes') and len(x.dynamic_axes) > 1


def l2_normalize(x, axis=None):
    axis = [axis]
    axis = _normalize_axis(axis, x)
    norm = C.sqrt(C.reduce_sum(C.square(x), axis=axis[0]))
    return x / norm


def hard_sigmoid(x):
    x = (0.2 * x) + 0.5
    x = C.clip(x, 0.0, 1.0)
    return x


def conv1d(x, kernel, strides=1, padding='valid',
           data_format=None, dilation_rate=1):
    data_format = normalize_data_format(data_format)

    if padding == 'causal':
        # causal (dilated) convolution:
        left_pad = dilation_rate * (kernel.shape[0] - 1)
        x = temporal_padding(x, (left_pad, 0))
        padding = 'valid'

    if data_format == 'channels_last':
        x = C.swapaxes(x, 0, 1)

    # As of Keras 2.0.0, all kernels are normalized
    # on the format `(steps, input_depth, depth)`,
    # independently of `data_format`.
    # CNTK expects `(depth, input_depth, steps)`.
    kernel = C.swapaxes(kernel, 0, 2)

    padding = _preprocess_border_mode(padding)

    if dev.type() == 0 and dilation_rate != 1:
        raise ValueError(
            'Dilated convolution on CPU is not supported by CNTK backend. '
            'Please set `dilation_rate` to 1. You passed: %s' % (dilation_rate,))

    dilation_rate = (1, dilation_rate)

    x = C.convolution(
        kernel,
        x,
        strides=strides,
        auto_padding=[False, padding],
        dilation=dilation_rate)

    if data_format == 'channels_last':
        x = C.swapaxes(x, 0, 1)
    return x


def conv2d(x, kernel, strides=(1, 1), padding='valid',
           data_format=None, dilation_rate=(1, 1)):
    data_format = normalize_data_format(data_format)

    x = _preprocess_conv2d_input(x, data_format)
    kernel = _preprocess_conv2d_kernel(kernel, data_format)
    padding = _preprocess_border_mode(padding)

    if dev.type() == 0 and dilation_rate != (1, 1):
        raise ValueError(
            'Dilated convolution on CPU is not supported by CNTK backend. '
            'Please set `dilation_rate` to (1, 1). '
            'You passed: %s' % (dilation_rate,))

    dilation_rate = (1,) + dilation_rate

    x = C.convolution(kernel,
                      x,
                      strides,
                      auto_padding=[False, padding, padding],
                      dilation=dilation_rate)

    return _postprocess_conv2d_output(x, data_format)


def separable_conv1d(x, depthwise_kernel, pointwise_kernel, strides=1,
                     padding='valid', data_format=None, dilation_rate=1):
    data_format = normalize_data_format(data_format)
    if isinstance(strides, int):
        strides = (strides,)
    if isinstance(dilation_rate, int):
        dilation_rate = (dilation_rate,)

    if dilation_rate != (1,):
        raise ValueError(
            'Dilated separable 1D convolution is currently not supported '
            'by CNTK backend. Please set `dilation_rate` to 1. '
            'You passed: %s' % (dilation_rate,))

    if data_format == 'channels_last':
        spatial_start_dim = 2
    else:
        spatial_start_dim = 3
    x = expand_dims(x, spatial_start_dim)
    depthwise_kernel = expand_dims(depthwise_kernel, 1)
    pointwise_kernel = expand_dims(pointwise_kernel, 1)
    strides = (1,) + strides + (1,)
    dilation_rate = (1,) + dilation_rate

    x = _preprocess_conv2d_input(x, data_format)
    depthwise_kernel = _preprocess_conv2d_kernel(depthwise_kernel, data_format)
    depthwise_kernel = C.reshape(C.transpose(depthwise_kernel, (1, 0, 2, 3)),
                                 (-1, 1) + depthwise_kernel.shape[2:])
    pointwise_kernel = _preprocess_conv2d_kernel(pointwise_kernel, data_format)
    padding = _preprocess_border_mode(padding)

    x = C.convolution(depthwise_kernel, x,
                      strides=strides,
                      auto_padding=[False, padding, padding],
                      groups=x.shape[0])
    x = C.convolution(pointwise_kernel, x,
                      strides=(1, 1, 1),
                      auto_padding=[False])

    x = _postprocess_conv2d_output(x, data_format)
    return squeeze(x, spatial_start_dim)


def separable_conv2d(x, depthwise_kernel, pointwise_kernel, strides=(1, 1),
                     padding='valid', data_format=None, dilation_rate=(1, 1)):
    data_format = normalize_data_format(data_format)

    x = _preprocess_conv2d_input(x, data_format)
    depthwise_kernel = _preprocess_conv2d_kernel(depthwise_kernel, data_format)
    depthwise_kernel = C.reshape(C.transpose(depthwise_kernel, (1, 0, 2, 3)),
                                 (-1, 1) + depthwise_kernel.shape[2:])
    pointwise_kernel = _preprocess_conv2d_kernel(pointwise_kernel, data_format)
    padding = _preprocess_border_mode(padding)

    if dilation_rate == (1, 1):
        strides = (1,) + strides
        x = C.convolution(depthwise_kernel, x,
                          strides=strides,
                          auto_padding=[False, padding, padding],
                          groups=x.shape[0])
        x = C.convolution(pointwise_kernel, x,
                          strides=(1, 1, 1),
                          auto_padding=[False])
    else:
        if dilation_rate[0] != dilation_rate[1]:
            raise ValueError('CNTK Backend: non-square dilation_rate is '
                             'not supported.')
        if strides != (1, 1):
            raise ValueError('Invalid strides for dilated convolution')
        x = C.convolution(depthwise_kernel, x,
                          strides=dilation_rate[0],
                          auto_padding=[False, padding, padding])
        x = C.convolution(pointwise_kernel, x,
                          strides=(1, 1, 1),
                          auto_padding=[False])
    return _postprocess_conv2d_output(x, data_format)


def depthwise_conv2d(x, depthwise_kernel, strides=(1, 1), padding='valid',
                     data_format=None, dilation_rate=(1, 1)):
    data_format = normalize_data_format(data_format)

    x = _preprocess_conv2d_input(x, data_format)
    depthwise_kernel = _preprocess_conv2d_kernel(depthwise_kernel, data_format)
    depthwise_kernel = C.reshape(C.transpose(depthwise_kernel, (1, 0, 2, 3)),
                                 (-1, 1) + depthwise_kernel.shape[2:])
    padding = _preprocess_border_mode(padding)
    if dilation_rate == (1, 1):
        strides = (1,) + strides
        x = C.convolution(depthwise_kernel, x,
                          strides=strides,
                          auto_padding=[False, padding, padding],
                          groups=x.shape[0])
    else:
        if dilation_rate[0] != dilation_rate[1]:
            raise ValueError('CNTK Backend: non-square dilation_rate is '
                             'not supported.')
        if strides != (1, 1):
            raise ValueError('Invalid strides for dilated convolution')
        x = C.convolution(depthwise_kernel, x,
                          strides=dilation_rate[0],
                          auto_padding=[False, padding, padding],
                          groups=x.shape[0])
    return _postprocess_conv2d_output(x, data_format)


def conv3d(x, kernel, strides=(1, 1, 1), padding='valid',
           data_format=None, dilation_rate=(1, 1, 1)):
    data_format = normalize_data_format(data_format)

    x = _preprocess_conv3d_input(x, data_format)
    kernel = _preprocess_conv3d_kernel(kernel, data_format)
    padding = _preprocess_border_mode(padding)

    if dev.type() == 0 and dilation_rate != (1, 1, 1):
        raise ValueError(
            'Dilated convolution on CPU is not supported by CNTK backend. '
            'Please set `dilation_rate` to (1, 1, 1). '
            'You passed: %s' % (dilation_rate,))

    dilation_rate = (1,) + dilation_rate

    x = C.convolution(
        kernel,
        x,
        strides,
        auto_padding=[False, padding, padding, padding],
        dilation=dilation_rate)

    return _postprocess_conv3d_output(x, data_format)


def conv3d_transpose(x, kernel, output_shape, strides=(1, 1, 1),
                     padding='valid', data_format=None):
    data_format = normalize_data_format(data_format)

    x = _preprocess_conv3d_input(x, data_format)
    kernel = _preprocess_conv3d_kernel(kernel, data_format)
    padding = _preprocess_border_mode(padding)
    strides = (1,) + strides
    # cntk output_shape does not include batch axis
    output_shape = output_shape[1:]
    # in keras2, need handle output shape in different format
    if data_format == 'channels_last':
        output_shape = transpose_shape(output_shape, 'channels_first',
                                       spatial_axes=(0, 1, 2))

    x = C.convolution_transpose(
        kernel,
        x,
        strides,
        auto_padding=[
            False,
            padding,
            padding,
            padding],
        output_shape=output_shape)
    return _postprocess_conv3d_output(x, data_format)


def pool2d(x, pool_size, strides=(1, 1),
           padding='valid', data_format=None,
           pool_mode='max'):
    data_format = normalize_data_format(data_format)

    padding = _preprocess_border_mode(padding)
    x = _preprocess_conv2d_input(x, data_format)
    if pool_mode == 'max':
        x = C.pooling(
            x,
            C.MAX_POOLING,
            pool_size,
            strides,
            auto_padding=[padding])
    elif pool_mode == 'avg':
        x = C.pooling(
            x,
            C.AVG_POOLING,
            pool_size,
            strides,
            auto_padding=[padding])
    else:
        raise ValueError('Invalid pooling mode: ' + str(pool_mode))
    return _postprocess_conv2d_output(x, data_format)


def pool3d(x, pool_size, strides=(1, 1, 1), padding='valid',
           data_format=None, pool_mode='max'):
    data_format = normalize_data_format(data_format)

    padding = _preprocess_border_mode(padding)

    x = _preprocess_conv3d_input(x, data_format)

    if pool_mode == 'max':
        x = C.pooling(
            x,
            C.MAX_POOLING,
            pool_size,
            strides,
            auto_padding=[padding])
    elif pool_mode == 'avg':
        x = C.pooling(
            x,
            C.AVG_POOLING,
            pool_size,
            strides,
            auto_padding=[padding])
    else:
        raise ValueError('Invalid pooling mode: ' + str(pool_mode))

    return _postprocess_conv3d_output(x, data_format)


def relu(x, alpha=0., max_value=None, threshold=0.):

    if alpha != 0.:
        if threshold != 0.:
            negative_part = C.relu(-x + threshold)
        else:
            negative_part = C.relu(-x)

    if threshold != 0.:
        x = x * C.greater(x, threshold)
    else:
        x = C.relu(x)

    if max_value is not None:
        x = C.clip(x, 0.0, max_value)

    if alpha != 0.:
        x -= alpha * negative_part

    return x


def dropout(x, level, noise_shape=None, seed=None):
    if level < 0. or level >= 1:
        raise ValueError('CNTK Backend: Invalid dropout level %s, '
                         'must be in interval [0, 1].' % level)
    return C.dropout(x, level)


def batch_flatten(x):
    # cntk's batch axis is not in shape,
    # so just flatten all the dim in x.shape
    dim = np.prod(x.shape)
    x = C.reshape(x, (-1,))
    x._keras_shape = (None, dim)
    return x


def softmax(x, axis=-1):
    return C.softmax(x, axis=axis)


def softplus(x):
    return C.softplus(x)


def softsign(x):
    return x / (1 + C.abs(x))


def categorical_crossentropy(target, output, from_logits=False, axis=-1):
    # Here, unlike other backends, the tensors lack a batch dimension:
    axis_without_batch = -1 if axis == -1 else axis - 1
    output_dimensions = list(range(len(output.shape)))
    if axis_without_batch != -1 and axis_without_batch not in output_dimensions:
        raise ValueError(
            '{}{}{}'.format(
                'Unexpected channels axis {}. '.format(axis_without_batch),
                'Expected to be -1 or one of the axes of `output`, ',
                'which has {} dimensions.'.format(len(output.shape))))
    # If the channels are not in the last axis, move them to be there:
    if axis_without_batch != -1 and axis_without_batch != output_dimensions[-1]:
        permutation = output_dimensions[:axis_without_batch]
        permutation += output_dimensions[axis_without_batch + 1:]
        permutation += [axis_without_batch]
        output = C.transpose(output, permutation)
        target = C.transpose(target, permutation)
    if from_logits:
        result = C.cross_entropy_with_softmax(output, target)
        # cntk's result shape is (batch, 1), while keras expect (batch, )
        return C.reshape(result, ())
    else:
        # scale preds so that the class probas of each sample sum to 1
        output /= C.reduce_sum(output, axis=-1)
        # avoid numerical instability with epsilon clipping
        output = C.clip(output, epsilon(), 1.0 - epsilon())
        return -sum(target * C.log(output), axis=-1)


def sparse_categorical_crossentropy(target, output, from_logits=False, axis=-1):
    # Here, unlike other backends, the tensors lack a batch dimension:
    axis_without_batch = -1 if axis == -1 else axis - 1
    output_dimensions = list(range(len(output.shape)))
    if axis_without_batch != -1 and axis_without_batch not in output_dimensions:
        raise ValueError(
            '{}{}{}'.format(
                'Unexpected channels axis {}. '.format(axis_without_batch),
                'Expected to be -1 or one of the axes of `output`, ',
                'which has {} dimensions.'.format(len(output.shape))))
    target = C.one_hot(target, output.shape[axis_without_batch],
                       axis=axis_without_batch)
    target = C.reshape(target, output.shape)
    return categorical_crossentropy(target, output, from_logits, axis=axis)


class Function(object):

    def __init__(self, inputs, outputs, updates=[], **kwargs):
        self.placeholders = inputs
        self.trainer = None
        self.unrelated_updates = None
        self.updates = updates
        if len(updates) > 0:
            assert len(outputs) > 0
            self.loss = outputs[0]
            # need group update by gradient place holder
            u_ops = []
            unrelated_updates = []
            for update in updates:
                if isinstance(update, tuple):
                    if len(update) != 2:
                        raise NotImplementedError
                    else:
                        u = C.assign(update[0], update[1])
                else:
                    u = update

                if len(u.arguments) == 0:
                    u_ops.append(u)
                else:
                    unrelated_updates.append(u)

            update_func = C.combine([u.output for u in u_ops])

            grads = update_func.find_all_with_name('keras_grad_placeholder')

            u_list = []
            p_list = []
            for g in grads:
                if g in grad_parameter_dict:
                    p_list.append(grad_parameter_dict[g])
                    u_list.append(g)
                else:
                    raise ValueError(
                        'CNTK backend: when constructing trainer, '
                        'found gradient node `%s` which is not '
                        'related to any parameters in the model. '
                        'Please double check how the gradient node '
                        'is constructed.' % g)

            if len(u_list) > 0:
                learner = C.cntk_py.universal_learner(p_list, u_list, update_func)

                criterion = (
                    outputs[0],
                    outputs[1]) if len(outputs) > 1 else (
                    outputs[0],
                )
                self.trainer = C.trainer.Trainer(
                    outputs[0], criterion, [learner])
                self.trainer_output = tuple([f.output for f in criterion])
            elif len(u_ops) > 0:
                unrelated_updates.extend(u_ops)

            if len(unrelated_updates) > 0:
                self.unrelated_updates = C.combine(
                    [_.output for _ in unrelated_updates])

        if self.trainer is None:
            self.metrics_outputs = [f.output for f in outputs]
            self.metrics_func = C.combine(self.metrics_outputs)
        # cntk only could handle loss and 1 metric in trainer, for metrics more
        # than 2, need manual eval
        elif len(outputs) > 2:
            self.metrics_outputs = [f.output for f in outputs[2:]]
            self.metrics_func = C.combine(self.metrics_outputs)
        else:
            self.metrics_func = None

    @staticmethod
    def _is_input_shape_compatible(input, placeholder):
        if hasattr(input, 'shape') and hasattr(placeholder, 'shape'):
            num_dynamic = get_num_dynamic_axis(placeholder)
            input_shape = input.shape[num_dynamic:]
            placeholder_shape = placeholder.shape
            for i, p in zip(input_shape, placeholder_shape):
                if i != p and p != C.InferredDimension and p != C.FreeDimension:
                    return False
        return True

    def __call__(self, inputs):
        global _LEARNING_PHASE_PLACEHOLDER
        global _LEARNING_PHASE
        assert isinstance(inputs, (list, tuple))
        feed_dict = {}
        for tensor, value in zip(self.placeholders, inputs):
            # cntk only support calculate on float, do auto cast here
            if (hasattr(value, 'dtype') and
               value.dtype != np.float32 and
               value.dtype != np.float64):
                value = value.astype(np.float32)

            if tensor == _LEARNING_PHASE_PLACEHOLDER:
                _LEARNING_PHASE_PLACEHOLDER.value = np.asarray(value)
            else:
                # in current version cntk can't support input with variable
                # length. Will support it in next release.
                if not self._is_input_shape_compatible(value, tensor):
                    raise ValueError(
                        'CNTK backend: The placeholder has been resolved '
                        'to shape `%s`, but input shape is `%s`. Currently '
                        'CNTK can not take variable length inputs. Please '
                        'pass inputs that have a static shape.'
                        % (str(tensor.shape), str(value.shape)))
            feed_dict[tensor] = value

        updated = []
        if self.trainer is not None:
            input_dict = {}
            for argument in self.loss.arguments:
                if argument in feed_dict:
                    input_dict[argument] = feed_dict[argument]
                else:
                    raise ValueError(
                        'CNTK backend: argument %s is not found in inputs. '
                        'Please double check the model and inputs in '
                        '`train_function`.' % argument.name)

            result = self.trainer.train_minibatch(
                input_dict, self.trainer_output)

            assert(len(result) == 2)
            outputs = result[1]
            for o in self.trainer_output:
                updated.append(outputs[o])

        if self.metrics_func is not None:
            input_dict = {}
            for argument in self.metrics_func.arguments:
                if argument in feed_dict:
                    input_dict[argument] = feed_dict[argument]
                else:
                    raise ValueError('CNTK backend: metrics argument %s '
                                     'is not found in inputs. Please double '
                                     'check the model and inputs.' % argument.name)
            # Some ops (like dropout) won't be applied during "eval" in cntk.
            # They only evaluated in training phase. To make it work, call
            # "forward" method to let cntk know we want to evaluate them.from
            # But the assign ops won't be executed under this mode, that's why
            # we need this check.
            if (self.unrelated_updates is None and
                    (_LEARNING_PHASE_PLACEHOLDER.value == 1.0 or
                        _LEARNING_PHASE == 1)):
                _, output_values = self.metrics_func.forward(
                    input_dict,
                    self.metrics_func.outputs,
                    (self.metrics_func.outputs[0],),
                    as_numpy=False)
            else:
                output_values = self.metrics_func.eval(input_dict, as_numpy=False)
            if isinstance(output_values, dict):
                for o in self.metrics_outputs:
                    value = output_values[o]
                    v = value.asarray()
                    updated.append(v)
            else:
                v = output_values.asarray()
                for o in self.metrics_outputs:
                    updated.append(v)

        if self.unrelated_updates is not None:
            input_dict = {}
            for argument in self.unrelated_updates.arguments:
                if argument in feed_dict:
                    input_dict[argument] = feed_dict[argument]
                else:
                    raise ValueError(
                        'CNTK backend: assign ops argument %s '
                        'is not found in inputs. Please double '
                        'check the model and inputs.' % argument.name)
            self.unrelated_updates.eval(input_dict, as_numpy=False)
        return updated


def function(inputs, outputs, updates=[], **kwargs):
    return Function(inputs, outputs, updates=updates, **kwargs)


def temporal_padding(x, padding=(1, 1)):
    assert len(padding) == 2
    num_dynamic_axis = _get_dynamic_axis_num(x)
    assert len(x.shape) == 3 - (1 if num_dynamic_axis > 0 else 0)
    return pad(x, [padding], 'channels_last', num_dynamic_axis)


def _padding(x, pattern, axis):  # pragma: no cover
    base_shape = x.shape
    if b_any([dim < 0 for dim in base_shape]):
        raise ValueError('CNTK Backend: padding input tensor with '
                         'shape `%s` contains non-specified dimension, '
                         'which is not supported. Please give fixed '
                         'dimension to enable padding.' % base_shape)
    if pattern[0] > 0:
        prefix_shape = list(base_shape)
        prefix_shape[axis] = pattern[0]
        prefix_shape = tuple(prefix_shape)
        x = C.splice(C.constant(value=0, shape=prefix_shape), x, axis=axis)
        base_shape = x.shape
    if pattern[1] > 0:
        postfix_shape = list(base_shape)
        postfix_shape[axis] = pattern[1]
        postfix_shape = tuple(postfix_shape)
        x = C.splice(x, C.constant(value=0, shape=postfix_shape), axis=axis)
    return x


def pad(x, pad_info, data_format, num_dynamic_axis):
    if hasattr(C, 'pad'):
        pattern = [list(p) for p in pad_info]
        if data_format == 'channels_first':
            pattern = [[0, 0]] + pattern
        else:
            pattern = pattern + [[0, 0]]
        if num_dynamic_axis == 0:
            pattern = [[0, 0]] + pattern
        return C.pad(x, pattern=pattern)
    else:  # pragma: no cover
        for (a, p) in enumerate(pad_info):
            x = _padding(x, p,
                         a + (1 if num_dynamic_axis == 0 else 0) +
                         (1 if data_format == 'channels_first' else 0))
        return x


def spatial_2d_padding(x, padding=((1, 1), (1, 1)), data_format=None):
    assert len(padding) == 2
    assert len(padding[0]) == 2
    assert len(padding[1]) == 2
    data_format = normalize_data_format(data_format)

    num_dynamic_axis = _get_dynamic_axis_num(x)
    assert len(x.shape) == 4 - (1 if num_dynamic_axis > 0 else 0)
    return pad(x, padding, data_format, num_dynamic_axis)


def spatial_3d_padding(x, padding=((1, 1), (1, 1), (1, 1)), data_format=None):
    assert len(padding) == 3
    assert len(padding[0]) == 2
    assert len(padding[1]) == 2
    assert len(padding[2]) == 2
    data_format = normalize_data_format(data_format)

    num_dynamic_axis = _get_dynamic_axis_num(x)
    assert len(x.shape) == 5 - (1 if num_dynamic_axis > 0 else 0)
    return pad(x, padding, data_format, num_dynamic_axis)


def one_hot(indices, num_classes):
    return C.one_hot(indices, num_classes)


def get_value(x):
    if isinstance(
            x,
            (C.variables.Parameter, C.variables.Constant)):
        return x.value
    else:
        return eval(x)


def batch_get_value(xs):
    result = [get_value(x) for x in xs]

    return result


def set_value(x, value):
    if (isinstance(x, C.variables.Parameter) or
       isinstance(x, C.variables.Constant)):
        if isinstance(value, (float, int)):
            value = np.full(x.shape, value, dtype=floatx())
        x.value = value
    else:
        raise NotImplementedError


def print_tensor(x, message=''):
    return C.user_function(
        LambdaFunc(x,
                   when=lambda x: True,
                   execute=lambda x: print(message)))


def batch_set_value(tuples):
    for t in tuples:
        x = t[0]
        value = t[1]
        if isinstance(value, np.ndarray) is False:
            value = np.asarray(value)
        if isinstance(x, C.variables.Parameter):
            x.value = value
        else:
            raise NotImplementedError


def stop_gradient(variables):
    if isinstance(variables, (list, tuple)):
        return map(C.stop_gradient, variables)
    else:
        return C.stop_gradient(variables)


def switch(condition, then_expression, else_expression):
    if callable(then_expression):
        then_expression = then_expression()
    if callable(else_expression):
        else_expression = else_expression()
    ndim_cond = ndim(condition)
    ndim_expr = ndim(then_expression)
    if ndim_cond > ndim_expr:
        raise ValueError('Rank of condition should be less'
                         ' than or equal to rank of then and'
                         ' else expressions. ndim(condition)=' +
                         str(ndim_cond) + ', ndim(then_expression)'
                         '=' + str(ndim_expr))
    elif ndim_cond < ndim_expr:
        shape_expr = int_shape(then_expression)
        ndim_diff = ndim_expr - ndim_cond
        for i in range(ndim_diff):
            condition = expand_dims(condition)
            condition = tile(condition, shape_expr[ndim_cond + i])
    return C.element_select(condition,
                            then_expression,
                            else_expression)


def elu(x, alpha=1.):
    res = C.elu(x)
    if alpha == 1:
        return res
    else:
        return C.element_select(C.greater(x, 0), res, alpha * res)


def in_top_k(predictions, targets, k):
    _targets = C.one_hot(targets, predictions.shape[-1])
    result = [C.classification_error(predictions[i], _targets[i], topN=k)
              for i in range(predictions.shape[0])]
    result = concatenate(result, axis=-1)
    return 1 - C.reshape(result, shape=(-1,))


def conv2d_transpose(x, kernel, output_shape, strides=(1, 1),
                     padding='valid', data_format=None, dilation_rate=(1, 1)):
    data_format = normalize_data_format(data_format)

    x = _preprocess_conv2d_input(x, data_format)
    kernel = _preprocess_conv2d_kernel(kernel, data_format)
    padding = _preprocess_border_mode(padding)
    strides = (1,) + strides
    # cntk output_shape does not include batch axis
    output_shape = output_shape[1:]
    # in keras2, need handle output shape in different format
    if data_format == 'channels_last':
        output_shape = transpose_shape(output_shape, 'channels_first',
                                       spatial_axes=(0, 1))

    dilation_rate = (1,) + dilation_rate

    x = C.convolution_transpose(
        kernel,
        x,
        strides,
        auto_padding=[
            False,
            padding,
            padding],
        output_shape=output_shape,
        dilation=dilation_rate)
    return _postprocess_conv2d_output(x, data_format)


def identity(x, name=None):
    if name is None:
        name = '%s_alias' % x.name
    return C.alias(x, name=name)


def _preprocess_conv2d_input(x, data_format):
    if data_format == 'channels_last':
        # TF uses the last dimension as channel dimension,
        # instead of the 2nd one.
        # TH input shape: (samples, input_depth, rows, cols)
        # TF input shape: (samples, rows, cols, input_depth)
        x = C.transpose(x, (2, 0, 1))
    return x


def _preprocess_conv2d_kernel(kernel, data_format):
    # As of Keras 2.0.0, all kernels are normalized
    # on the format `(rows, cols, input_depth, depth)`,
    # independently of `data_format`.
    # CNTK expects `(depth, input_depth, rows, cols)`.
    kernel = C.transpose(kernel, (3, 2, 0, 1))
    return kernel


def _preprocess_border_mode(padding):
    if padding == 'same':
        padding = True
    elif padding == 'valid':
        padding = False
    else:
        raise ValueError('Invalid border mode: ' + str(padding))
    return padding


def _postprocess_conv2d_output(x, data_format):
    if data_format == 'channels_last':
        x = C.transpose(x, (1, 2, 0))
    return x


def _preprocess_conv3d_input(x, data_format):
    if data_format == 'channels_last':
        # TF uses the last dimension as channel dimension,
        # instead of the 2nd one.
        # TH input shape: (samples, input_depth, conv_dim1, conv_dim2, conv_dim3)
        # TF input shape: (samples, conv_dim1, conv_dim2, conv_dim3,
        # input_depth)
        x = C.transpose(x, (3, 0, 1, 2))
    return x


def _preprocess_conv3d_kernel(kernel, dim_ordering):
    kernel = C.transpose(kernel, (4, 3, 0, 1, 2))
    return kernel


def _postprocess_conv3d_output(x, dim_ordering):
    if dim_ordering == 'channels_last':
        x = C.transpose(x, (1, 2, 3, 0))
    return x


def _get_dynamic_axis_num(x):
    if hasattr(x, 'dynamic_axes'):
        return len(x.dynamic_axes)
    else:
        return 0


def _contain_seqence_axis(x):
    if _get_dynamic_axis_num(x) > 1:
        return x.dynamic_axes[1] == C.Axis.default_dynamic_axis()
    else:
        return False


def get_num_dynamic_axis(x):
    return _get_dynamic_axis_num(x)


def _reduce_on_axis(x, axis, reduce_fun_name):
    if isinstance(axis, list):
        for a in axis:
            if isinstance(a, C.Axis) \
                    and a != C.Axis.default_batch_axis() \
                    and hasattr(C.sequence, reduce_fun_name):
                x = getattr(C.sequence, reduce_fun_name)(x, a)
            else:
                x = getattr(C, reduce_fun_name)(x, a)
    else:
        x = getattr(C, reduce_fun_name)(x, axis)
    return x


def _reshape_sequence(x, time_step):
    tmp_shape = list(int_shape(x))
    tmp_shape[1] = time_step
    return reshape(x, tmp_shape)


def local_conv1d(inputs, kernel, kernel_size, strides, data_format=None):
    data_format = normalize_data_format(data_format)

    stride = strides[0]
    kernel_shape = int_shape(kernel)
    output_length, feature_dim, filters = kernel_shape

    xs = []
    for i in range(output_length):
        slice_length = py_slice(i * stride,
                                i * stride + kernel_size[0])
        xs.append(reshape(inputs[:, slice_length, :],
                          (-1, 1, feature_dim)))
    x_aggregate = concatenate(xs, axis=1)
    # transpose kernel to output_filters first, to apply broadcast
    weight = permute_dimensions(kernel, (2, 0, 1))
    # Shape: (batch, filters, output_length, input_length * kernel_size)
    output = x_aggregate * weight
    # Shape: (batch, filters, output_length)
    output = sum(output, axis=3)
    # Shape: (batch, output_length, filters)
    return permute_dimensions(output, (0, 2, 1))


def local_conv2d(inputs,
                 kernel,
                 kernel_size,
                 strides,
                 output_shape,
                 data_format=None):
    data_format = normalize_data_format(data_format)

    stride_row, stride_col = strides
    output_row, output_col = output_shape
    kernel_shape = int_shape(kernel)
    _, feature_dim, filters = kernel_shape
    xs = []

    for i in range(output_row):
        for j in range(output_col):
            slice_row = py_slice(i * stride_row,
                                 i * stride_row + kernel_size[0])
            slice_col = py_slice(j * stride_col,
                                 j * stride_col + kernel_size[1])
            if data_format == 'channels_first':
                xs.append(reshape(inputs[:, :, slice_row, slice_col],
                                  (-1, 1, feature_dim)))
            else:
                xs.append(reshape(inputs[:, slice_row, slice_col, :],
                                  (-1, 1, feature_dim)))
    x_aggregate = concatenate(xs, axis=1)
    # transpose kernel to put filters first
    weight = permute_dimensions(kernel, (2, 0, 1))
    # shape: batch, filters, output_length, input_length * kernel_size
    output = x_aggregate * weight
    # shape: batch, filters, output_length
    output = sum(output, axis=3)
    # shape: batch, filters, row, col
    output = reshape(output,
                     (-1, filters, output_row, output_col))

    if data_format == 'channels_last':
        # shape: batch, row, col, filters
        output = permute_dimensions(output, (0, 2, 3, 1))

    return output


def reverse(x, axes):
    if isinstance(axes, int):
        axes = [axes]
    cntk_axes = _normalize_axis(axes, x)
    begin_index = [0 for _ in cntk_axes]
    end_index = [0 for _ in cntk_axes]
    strides = [-1 for _ in cntk_axes]
    return C.slice(x, cntk_axes, begin_index, end_index, strides)


def slice(x, start, size):
    if not (len(int_shape(x)) == len(start) == len(size)):
        raise ValueError('The dimension and the size of indices should match.')
    out = x[tuple([py_slice(i, i + j) for (i, j) in zip(start, size)])]
    out._keras_shape = tuple(size)
    return out


def _reshape_batch(x, shape):
    # there is a bug in cntk 2.1's unpack_batch implementation
    if hasattr(C, 'unpack_batch') and _get_cntk_version() >= 2.2:
        const_a = C.unpack_batch(x)
        const_a = C.reshape(const_a, shape)
        return C.to_batch(const_a)
    else:
        return C.user_function(ReshapeBatch(x, shape[1:]))


def _get_cntk_version():
    version = C.__version__
    if version.endswith('+'):
        version = version[:-1]
    # for hot fix, ignore all the . except the first one.
    if len(version) > 2 and version[1] == '.':
        version = version[:2] + version[2:].replace('.', '')
    try:
        return float(version)
    except:
        warnings.warn(
            'CNTK backend warning: CNTK version not detected. '
            'Will using CNTK 2.0 GA as default.')
        return float(2.0)


class ReshapeBatch(C.ops.functions.UserFunction):
    def __init__(self, input, shape, name='reshape_with_batch'):
        super(ReshapeBatch, self).__init__([input], as_numpy=False, name=name)
        self.from_shape = input.shape
        self.target_shape = shape

    def infer_outputs(self):
        batch_axis = C.Axis.default_batch_axis()
        return [
            C.output_variable(
                self.target_shape,
                self.inputs[0].dtype,
                [batch_axis])]

    def forward(self, arguments, device=None, outputs_to_retain=None):
        num_element = arguments.shape()[0] * np.prod(np.asarray(self.from_shape))
        num_static_element = np.prod(np.asarray(self.target_shape))
        num_batch = int(num_element / num_static_element)
        result = arguments.data().as_shape((num_batch,) + self.target_shape)
        return None, C.cntk_py.Value(result)

    def backward(self, state, root_gradients):
        grad_array_view = root_gradients.data()
        num_element = root_gradients.shape()[0] * np.prod(
            np.asarray(self.target_shape))
        num_static_element = np.prod(np.asarray(self.from_shape))
        num_old_batch = int(num_element / num_static_element)
        return C.cntk_py.Value(
            grad_array_view.as_shape(
                (num_old_batch,) + self.from_shape))


class ConvertToBatch(C.ops.functions.UserFunction):
    """Converts input first axis to CNTK batch axis.

    We may introduce this operation in CNTK native
    implementation later.

    # Arguments
        inputs: a cntk variable (parameter/constant)
        name: name of this node
    """

    def __init__(self, input, name='convert_to_batch'):
        super(ConvertToBatch, self).__init__([input], as_numpy=False, name=name)

    def infer_outputs(self):
        batch_axis = C.Axis.default_batch_axis()
        return [
            C.output_variable(
                self.inputs[0].shape[1:],
                self.inputs[0].dtype,
                [batch_axis])]

    def forward(self, arguments, device=None, outputs_to_retain=None):
        return None, C.cntk_py.Value(arguments.data())

    def backward(self, state, root_gradients):
        return C.cntk_py.Value(root_gradients.data())


class ConvertToStatic(C.ops.functions.UserFunction):
    """Converts input first axis to CNTK static axis.

    We may introduce this operation in CNTK native
    implementation later.

    # Arguments
        inputs: a cntk tensor which has batch axis
        batch_size: size of batch axis.
        name: name of this node.
    """

    def __init__(self, input, batch_size, name='convert_to_static'):
        super(ConvertToStatic, self).__init__([input], as_numpy=False, name=name)
        self.target_shape = (batch_size,) + input.shape

    def infer_outputs(self):
        return [
            C.output_variable(
                self.target_shape,
                self.inputs[0].dtype,
                [])]

    def forward(self, arguments, device=None, outputs_to_retain=None):
        return None, C.cntk_py.Value(arguments.data())

    def backward(self, state, root_gradients):
        return C.cntk_py.Value(root_gradients.data())


class LambdaFunc(C.ops.functions.UserFunction):
    def __init__(self,
                 arg,
                 when=lambda arg: True,
                 execute=lambda arg: print(arg),
                 name=''):
        self.when = when
        self.execute = execute

        super(LambdaFunc, self).__init__([arg], name=name)

    def infer_outputs(self):
        return [
            C.output_variable(
                self.inputs[0].shape,
                self.inputs[0].dtype,
                self.inputs[0].dynamic_axes)]

    def forward(self, argument, device=None, outputs_to_retain=None):
        if self.when(argument):
            self.execute(argument)

        return None, argument

    def backward(self, state, root_gradients):
        return root_gradients


def reset_uids():
    global _UID_PREFIXES
    _UID_PREFIXES = defaultdict(int)


def to_dense(tensor):
    raise NotImplementedError


def cumsum(x, axis=0):
    dim = x.shape[axis]
    U = C.constant(np.triu(np.ones((dim, dim))).astype(x.dtype))
    if axis != -1:
        x = C.swapaxes(x, -1, axis)
    out = C.times(x, U)
    if axis != -1:
        out = C.swapaxes(out, -1, axis)
    return out


def cumprod(x, axis=0):
    shape = x.shape
    out = x
    for rep in range(shape[axis] - 1):
        sliced_shape = list(shape)
        sliced_shape[axis] = rep + 1
        if axis == 0:
            _x = x[rep:(rep + 1)]
        elif axis == 1:
            _x = x[:, rep:(rep + 1)]
        elif axis == 2:
            _x = x[:, :, rep:(rep + 1)]
        y = concatenate([ones(sliced_shape, dtype=x.dtype),
                         repeat_elements(_x, rep=shape[axis] - 1 - rep, axis=axis)],
                        axis=axis)
        out = C.element_times(out, y)
    return out


def arange(start, stop=None, step=1, dtype='int32'):
    raise NotImplementedError


def ctc_label_dense_to_sparse(labels, label_lengths):
    raise NotImplementedError


def ctc_batch_cost(y_true, y_pred, input_length, label_length):
    raise NotImplementedError


def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1,
               merge_repeated=False):
    raise NotImplementedError


def map_fn(fn, elems, name=None, dtype=None):
    raise NotImplementedError


def foldl(fn, elems, initializer=None, name=None):
    """Reduce `elems` by `fn` combined them from left to right on dimension 0.

    # Arguments
        fn: Callable that will be called upon each element in `elems`
            (and on the optional `initializer`) passed as a second argument.
            The first argument passed to `fn` is the accumulator which is the
            accumulated value calculated from the preceding invocation of `fn`.
            Example For `fn`:
            ```python
            lambda acc, x: acc + x
            ```
        elems: Tensor
        initializer: (optional) Tensor, the initial value for the accumulator.
            In case of None value is provided during the call
            the first value is used (`elems[0]`) as `initializer` from `elems`
        name: (optional) String, name for the foldl node in the graph.

    # Returns
        Same type and shape as `initializer`

    # Raises:
        TypeError: if `fn` is not callable.
        TypeError: if `initializer` is neither a tensor nor None value.
        TypeError: if `elems` is not a tensor.
    """
    if not callable(fn):
        raise TypeError("`fn` must be callable.")
    if initializer is not None and not is_tensor(initializer):
        raise TypeError("`initializer` must be a tensor or None")
    if not is_tensor(elems):
        raise TypeError('`elems` must be a tensor')

    if initializer is None and shape(elems)[0] > 1:
        initializer = elems[0]
        elems = elems[1:]
    elif initializer is None:
        initializer = elems[0]
        elems = None

    accumulator = initializer
    if elems is not None:
        for i in range(shape(elems)[0]):
            accumulator = fn(accumulator, elems[i])

    if name is not None:
        accumulator.name = str(name)

    return reshape(accumulator, shape(initializer)[1:])


def foldr(fn, elems, initializer=None, name=None):
    """Reduce `elems` by `fn` combined them from right to left on dimension 0.

    # Arguments
        fn: Callable that will be called upon each element in `elems`
            (and on the optional `initializer`) passed as a second argument.
            The first argument passed to `fn` is the accumulator which is the
            accumulated value calculated from the preceding invocation of `fn`.
            Example For `fn`:
            ```python
            lambda acc, x: acc + x
            ```
        elems: Tensor
        initializer: (optional) Tensor, the initial value for the accumulator.
            In case of None value is provided during the call
            the last value is used (`elems[-1]`) as `initializer` from `elems`
        name: (optional) String, name for the foldr node in the graph.

    # Returns
        Same type and shape as `initializer`

    # Raises:
        TypeError: if `fn` is not callable.
        TypeError: if `initializer` is neither a tensor nor None value.
        TypeError: if `elems` is not a tensor.
    """
    if not callable(fn):
        raise TypeError("`fn` must be callable.")
    if initializer is not None and not is_tensor(initializer):
        raise TypeError("`initializer` must be a tensor or None")
    if not is_tensor(elems):
        raise TypeError('`elems` must be a tensor')

    if initializer is None and shape(elems)[0] > 1:
        initializer = elems[-1]
        elems = elems[:-1]
    elif initializer is None:
        initializer = elems[0]
        elems = None

    accumulator = initializer
    if elems is not None:
        for i in range(shape(elems)[0]):
            accumulator = fn(accumulator, elems[-i])

    if name is not None:
        accumulator.name = str(name)

    return reshape(accumulator, shape(initializer)[1:])


def control_dependencies(control_inputs):
    @contextmanager
    def nullcontextmanager():
        yield

    return nullcontextmanager()