/*******************************************************************************
 * Copyright 2018-2019 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/

/// @example cpu_cnn_inference_int8.cpp
/// @copybrief cpu_cnn_inference_int8_cpp
/// > Annotated version: @ref cpu_cnn_inference_int8_cpp

/// @page cpu_cnn_inference_int8_cpp CNN int8 inference example
/// This C++ API example demonstrates how to run AlexNet's conv3 and relu3
/// with int8 data type.
///
/// > Example code: @ref cpu_cnn_inference_int8.cpp

import org.bytedeco.javacpp.*;
import org.bytedeco.dnnl.*;
import static org.bytedeco.dnnl.global.dnnl.*;

public class CpuCnnInferenceInt8 {

    static long product(long[] dims) {
        long accumulate = 1;
        for (int i = 0; i < dims.length; i++)
            accumulate *= dims[i];
        return accumulate;
    }

    static void simple_net_int8() throws Exception {
        engine cpu_engine = new engine(engine.kind.cpu, 0);
        stream s = new stream(cpu_engine);

        int batch = 8;

        /// Configure tensor shapes
        /// @snippet cpu_cnn_inference_int8.cpp Configure tensor shapes
        //[Configure tensor shapes]
        // AlexNet: conv3
        // {batch, 256, 13, 13} (x) {384, 256, 3, 3} -> {batch, 384, 13, 13}
        // strides: {1, 1}
        long[] conv_src_tz = { batch, 256, 13, 13 };
        long[] conv_weights_tz = { 384, 256, 3, 3 };
        long[] conv_bias_tz = { 384 };
        long[] conv_dst_tz = { batch, 384, 13, 13 };
        long[] conv_strides = { 1, 1 };
        long[] conv_padding = { 1, 1 };
        //[Configure tensor shapes]

        /// Next, the example configures the scales used to quantize f32 data
        /// into int8. For this example, the scaling values are chosen
        /// arbitrarily; in a realistic scenario, they should be calculated
        /// from a set of precomputed values as previously mentioned.
        /// @snippet cpu_cnn_inference_int8.cpp Choose scaling factors
        //[Choose scaling factors]
        // Choose scaling factors for input, weight, output and bias quantization
        float[] src_scales = { 1.8f };
        float[] weight_scales = { 2.0f };
        float[] bias_scales = { 1.0f };
        float[] dst_scales = { 0.55f };

        // Choose channel-wise scaling factors for convolution:
        // the first half of the output channels uses 0.3, the second half 0.8
        float[] conv_scales = new float[384];
        int scales_half = 384 / 2;
        for (int i = 0; i < scales_half; i++)
            conv_scales[i] = 0.3f;
        for (int i = scales_half; i < conv_scales.length; i++)
            conv_scales[i] = 0.8f;
        //[Choose scaling factors]

        /// The *source, weights, bias* and *destination* datasets use the
        /// single-scale format with the mask set to '0', while the *output*
        /// of the convolution (conv_scales) uses the array format, where
        /// mask = 2 corresponds to the output-channel dimension.
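        // How the masks below are interpreted (a short sketch of the DNNL
        // output-scales semantics): a mask of 0 applies a single scale to the
        // whole tensor, while a mask with bit d set (1 << d) keeps one scale
        // per index along dimension d. Thus conv_mask = 2 == (1 << 1) selects
        // dimension 1 (the output channel) of the {batch, 384, 13, 13}
        // destination, which is why conv_scales above holds 384 entries.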
        /// @snippet cpu_cnn_inference_int8.cpp Set scaling mask
        //[Set scaling mask]
        int src_mask = 0;
        int weight_mask = 0;
        int bias_mask = 0;
        int dst_mask = 0;
        int conv_mask = 2; // 1 << output_channel_dim
        //[Set scaling mask]

        // Allocate input and output buffers for user data
        float[] user_src = new float[batch * 256 * 13 * 13];
        float[] user_dst = new float[batch * 384 * 13 * 13];

        // Allocate and fill buffers for weights and bias
        float[] conv_weights = new float[(int)product(conv_weights_tz)];
        float[] conv_bias = new float[(int)product(conv_bias_tz)];

        /// Create the memory primitives for user data (source, weights, and bias).
        /// The user data will be in its original 32-bit floating point format.
        /// @snippet cpu_cnn_inference_int8.cpp Allocate buffers
        //[Allocate buffers]
        memory user_src_memory = new memory(
                new memory.desc(conv_src_tz, memory.data_type.f32,
                        memory.format_tag.nchw),
                cpu_engine, new FloatPointer(user_src));
        memory user_weights_memory = new memory(
                new memory.desc(conv_weights_tz, memory.data_type.f32,
                        memory.format_tag.oihw),
                cpu_engine, new FloatPointer(conv_weights));
        memory user_bias_memory = new memory(
                new memory.desc(conv_bias_tz, memory.data_type.f32,
                        memory.format_tag.x),
                cpu_engine, new FloatPointer(conv_bias));
        //[Allocate buffers]

        /// Create a memory descriptor for each convolution parameter.
        /// The convolution data uses 8-bit integer values, so the memory
        /// descriptors are configured as:
        ///
        /// * 8-bit unsigned (u8) for source and destination.
        /// * 8-bit signed (s8) for bias and weights.
        ///
        /// > **Note**
        /// > The destination type is chosen as *unsigned* because the
        /// > convolution applies a ReLU operation where data results \f$\geq 0\f$.
        /// @snippet cpu_cnn_inference_int8.cpp Create convolution memory descriptors
        //[Create convolution memory descriptors]
        memory.desc conv_src_md = new memory.desc(conv_src_tz,
                memory.data_type.u8, memory.format_tag.any);
        memory.desc conv_bias_md = new memory.desc(conv_bias_tz,
                memory.data_type.s8, memory.format_tag.any);
        memory.desc conv_weights_md = new memory.desc(conv_weights_tz,
                memory.data_type.s8, memory.format_tag.any);
        memory.desc conv_dst_md = new memory.desc(conv_dst_tz,
                memory.data_type.u8, memory.format_tag.any);
        //[Create convolution memory descriptors]

        /// Create a convolution descriptor passing the int8 memory
        /// descriptors as parameters.
        /// @snippet cpu_cnn_inference_int8.cpp Create convolution descriptor
        //[Create convolution descriptor]
        convolution_forward.desc conv_desc = new convolution_forward.desc(
                prop_kind.forward, algorithm.convolution_direct,
                conv_src_md, conv_weights_md, conv_bias_md, conv_dst_md,
                conv_strides, conv_padding, conv_padding);
        //[Create convolution descriptor]

        /// Configuring int8-specific parameters of an int8 primitive is done
        /// via primitive attributes. Create an attributes object for the
        /// convolution and configure it accordingly.
        /// @snippet cpu_cnn_inference_int8.cpp Configure scaling
        //[Configure scaling]
        primitive_attr conv_attr = new primitive_attr();
        conv_attr.set_output_scales(conv_mask, conv_scales);
        //[Configure scaling]

        /// The ReLU layer from AlexNet is executed through the PostOps feature.
        /// Create a PostOps object and configure it to execute an _eltwise relu_
        /// operation.
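        // A note on the post-op configured below: eltwise_relu computes
        // f(x) = x for x > 0 and f(x) = alpha * x otherwise, so with
        // ops_alpha = 0 it is the standard ReLU, max(0, x). Fusing it as a
        // post-op avoids a separate ReLU primitive and intermediate memory;
        // the activation is applied directly to the convolution output.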
        /// @snippet cpu_cnn_inference_int8.cpp Configure post-ops
        //[Configure post-ops]
        float ops_scale = 1.f;
        float ops_alpha = 0.f; // relu negative slope
        float ops_beta = 0.f;
        post_ops ops = new post_ops();
        ops.append_eltwise(ops_scale, algorithm.eltwise_relu, ops_alpha, ops_beta);
        conv_attr.set_post_ops(ops);
        //[Configure post-ops]

        // Check whether an int8 convolution is supported on this system
        try {
            convolution_forward.primitive_desc conv_prim_desc =
                    new convolution_forward.primitive_desc(
                            conv_desc, conv_attr, cpu_engine);
        } catch (Exception e) {
            if (e.getMessage().contains("status = " + dnnl_unimplemented)) {
                System.err.println("Intel DNNL does not have an int8 convolution "
                        + "implementation that supports this system. Please refer "
                        + "to the developer guide for details.");
            }
            throw e;
        }

        /// Create a primitive descriptor using the convolution descriptor
        /// and passing along the int8 attributes in the constructor. The primitive
        /// descriptor for the convolution will contain the specific memory
        /// formats for the computation.
        /// @snippet cpu_cnn_inference_int8.cpp Create convolution primitive descriptor
        //[Create convolution primitive descriptor]
        convolution_forward.primitive_desc conv_prim_desc =
                new convolution_forward.primitive_desc(
                        conv_desc, conv_attr, cpu_engine);
        //[Create convolution primitive descriptor]

        /// Create a memory for each of the convolution's data input
        /// parameters (source, bias, weights, and destination). Using the
        /// convolution primitive descriptor as the creation parameter enables
        /// Intel DNNL to configure the memory formats for the convolution.
        ///
        /// Scaling parameters are passed to the reorder primitive via the
        /// attributes primitive.
        ///
        /// User memory must be transformed into convolution-friendly memory
        /// (for int8 and memory format). A reorder layer performs the data
        /// transformation from f32 (the original user data) into int8 format
        /// (the data used for the convolution). In addition, the reorder
        /// transforms the user data into the required memory format (as explained
        /// in the simple_net example).
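        // The reorders below perform the actual quantization: a reorder with
        // output scales attached computes, roughly,
        // dst = saturate(round(scale * src)) when converting f32 to an int8
        // type. For example (illustrative numbers only; the user buffers in
        // this example are zero-filled), an f32 source value of 60.0f with
        // src_scales = {1.8f} would be stored as round(1.8f * 60.0f) = 108
        // in the u8 convolution source.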
        ///
        /// @snippet cpu_cnn_inference_int8.cpp Quantize data and weights
        //[Quantize data and weights]
        memory conv_src_memory = new memory(conv_prim_desc.src_desc(), cpu_engine);
        primitive_attr src_attr = new primitive_attr();
        src_attr.set_output_scales(src_mask, src_scales);
        reorder.primitive_desc src_reorder_pd = new reorder.primitive_desc(
                cpu_engine, user_src_memory.get_desc(),
                cpu_engine, conv_src_memory.get_desc(), src_attr);
        reorder src_reorder = new reorder(src_reorder_pd);
        src_reorder.execute(s, user_src_memory, conv_src_memory);

        memory conv_weights_memory = new memory(conv_prim_desc.weights_desc(), cpu_engine);
        primitive_attr weight_attr = new primitive_attr();
        weight_attr.set_output_scales(weight_mask, weight_scales);
        reorder.primitive_desc weight_reorder_pd = new reorder.primitive_desc(
                cpu_engine, user_weights_memory.get_desc(),
                cpu_engine, conv_weights_memory.get_desc(), weight_attr);
        reorder weight_reorder = new reorder(weight_reorder_pd);
        weight_reorder.execute(s, user_weights_memory, conv_weights_memory);

        memory conv_bias_memory = new memory(conv_prim_desc.bias_desc(), cpu_engine);
        primitive_attr bias_attr = new primitive_attr();
        bias_attr.set_output_scales(bias_mask, bias_scales);
        reorder.primitive_desc bias_reorder_pd = new reorder.primitive_desc(
                cpu_engine, user_bias_memory.get_desc(),
                cpu_engine, conv_bias_memory.get_desc(), bias_attr);
        reorder bias_reorder = new reorder(bias_reorder_pd);
        bias_reorder.execute(s, user_bias_memory, conv_bias_memory);
        //[Quantize data and weights]

        memory conv_dst_memory = new memory(conv_prim_desc.dst_desc(), cpu_engine);

        /// Create the convolution primitive and execute it. The int8 example
        /// computes the same Convolution + ReLU layers as the AlexNet
        /// simple-net.cpp example, using the int8 and PostOps approach.
        /// Although performance is not measured here, the int8 version would
        /// in practice require less computation time to achieve similar results.
        /// @snippet cpu_cnn_inference_int8.cpp Create convolution primitive
        //[Create convolution primitive]
        convolution_forward conv = new convolution_forward(conv_prim_desc);
        conv.execute(s, new IntMemoryMap()
                .put(DNNL_ARG_SRC, conv_src_memory)
                .put(DNNL_ARG_WEIGHTS, conv_weights_memory)
                .put(DNNL_ARG_BIAS, conv_bias_memory)
                .put(DNNL_ARG_DST, conv_dst_memory));
        //[Create convolution primitive]

        /// @page cpu_cnn_inference_int8_cpp
        /// Finally, *dst memory* may be dequantized from int8 into the original
        /// f32 format. Create a memory primitive for the user data in the original
        /// 32-bit floating point format and then apply a reorder to transform the
        /// computation output data.
        /// @snippet cpu_cnn_inference_int8.cpp Dequantize the result
        //[Dequantize the result]
        memory user_dst_memory = new memory(
                new memory.desc(conv_dst_tz, memory.data_type.f32,
                        memory.format_tag.nchw),
                cpu_engine, new FloatPointer(user_dst));
        primitive_attr dst_attr = new primitive_attr();
        dst_attr.set_output_scales(dst_mask, dst_scales);
        reorder.primitive_desc dst_reorder_pd = new reorder.primitive_desc(
                cpu_engine, conv_dst_memory.get_desc(),
                cpu_engine, user_dst_memory.get_desc(), dst_attr);
        reorder dst_reorder = new reorder(dst_reorder_pd);
        dst_reorder.execute(s, conv_dst_memory, user_dst_memory);
        //[Dequantize the result]

        s._wait();
    }

    public static void main(String[] args) throws Exception {
        try {
            simple_net_int8();
            System.out.println("Simple-net-int8 example passed!");
        } catch (Exception e) {
            System.err.println("exception: " + e);
        }
        System.exit(0);
    }
}
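// A closing note on the dequantization step (illustrative only): the final
// reorder applies dst_scales while converting u8 back to f32, so each element
// of user_dst is roughly dst_scale * q, where q is the u8 value produced by
// the quantized convolution. For example, a stored value of 100 maps back to
// 0.55f * 100 = 55.0f.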