/*
 * Copyright 2017-2017 Spotify AB
 * Copyright 2017-2019 The Last Pickle Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.cassandrareaper.service;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.Arrays;
import java.util.UUID;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;

import javax.management.JMException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.codahale.metrics.Gauge;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

import io.cassandrareaper.AppContext;
import io.cassandrareaper.ReaperApplicationConfiguration.DatacenterAvailability;
import io.cassandrareaper.ReaperException;
import io.cassandrareaper.core.Cluster;
import io.cassandrareaper.core.NodeMetrics;
import io.cassandrareaper.jmx.ClusterFacade;
import io.cassandrareaper.jmx.JmxProxy;
import io.cassandrareaper.jmx.JmxProxyImpl;
import io.cassandrareaper.service.RepairManager.MetricsManager;
import io.cassandrareaper.storage.CassandraStorage;
import io.cassandrareaper.storage.IDistributedStorage;
import io.cassandrareaper.storage.IStorage;
import io.cassandrareaper.storage.OpType;


/**
 * Periodic "heartbeat" duties of a Reaper instance backed by distributed storage.
 *
 * <p>{@link #beat()} saves a heartbeat through {@link IDistributedStorage#saveHeartbeat()}, rate limited by
 * the configured maximum beat frequency. {@link #beatMetrics()} (collocated mode only) asynchronously answers
 * node metrics requests found in storage and, in {@code SIDECAR} mode, additionally:
 * <ul>
 *   <li>stores active compactions and active streams through the {@link MetricsService},</li>
 *   <li>reacts to the {@link #REPAIR_CONFLICT_FILE} flag file by broadcasting a terminate-all operation and
 *       cancelling all repairs,</li>
 *   <li>broadcasts an upgrade-detected operation while either upgrade flag file exists,</li>
 *   <li>cancels all repairs when a newer terminate-all broadcast is found in storage.</li>
 * </ul>
 *
 * <p>All asynchronous work runs on a private {@link ForkJoinPool} that is shut down by {@link #close()}.
 */
final class Heart implements AutoCloseable {

  /** Flag file whose non-empty content (an epoch-millis timestamp) requests cancellation of all repairs. */
  public static final String REPAIR_CONFLICT_FILE = "/var/local/cassandra/incremental_repair_conflict.txt";

  /** Flag files whose mere existence makes this instance broadcast {@code OP_UPGRADE_DETECTED}. */
  public static final String UPGRADE_REQUESTED_FILE = "/var/local/cassandra/nodetool_upgradesstables_requested";
  public static final String UPGRADE_IN_FLIGHT_FILE = "/var/local/cassandra/nodetool_upgradesstables_in_progress";

  private static final AtomicBoolean GAUGES_REGISTERED = new AtomicBoolean(false);
  private static final Logger LOG = LoggerFactory.getLogger(Heart.class);
  private static final long DEFAULT_MAX_FREQUENCY = TimeUnit.SECONDS.toMillis(30);

  /** A repair-conflict request whose recorded timestamp is older than this is ignored. */
  private static final long REPAIR_CONFLICT_MAX_AGE_MILLIS = TimeUnit.HOURS.toMillis(1);

  /** File holding an integer that is forwarded in the "active anticompactions" field of stored node metrics. */
  private static final String REAPER_PROGRESS_FILE = "/var/lib/cassandra/reaper_progress";

  /** Shared serializer for the timestamps written with broadcast operations. */
  private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

  /** Timestamp of the newest terminate-all broadcast this JVM has already acted upon. */
  private static long myLastFlagDrivenTerminateAll = System.currentTimeMillis();

  // Both "last beat" markers start one hour in the past so that the very first beat is never rate limited.
  private final AtomicLong lastBeat = new AtomicLong(System.currentTimeMillis() - TimeUnit.HOURS.toMillis(1));
  private final AtomicLong lastMetricBeat = new AtomicLong(System.currentTimeMillis() - TimeUnit.HOURS.toMillis(1));
  private final ForkJoinPool forkJoinPool = new ForkJoinPool(64);
  private final AppContext context;
  private final MetricsService metricsService;
  private final long maxBeatFrequencyMillis;
  private final AtomicBoolean updatingNodeMetrics = new AtomicBoolean(false);

  private Heart(AppContext context, long maxBeatFrequency) throws ReaperException {
    this.context = context;
    this.maxBeatFrequencyMillis = maxBeatFrequency;
    this.metricsService = MetricsService.create(context);
  }

  /**
   * Creates a heart using the default maximum beat frequency (30 seconds).
   *
   * @param context the application context
   * @return a new heart
   * @throws ReaperException if the {@link MetricsService} cannot be created
   */
  static Heart create(AppContext context) throws ReaperException {
    return new Heart(context, DEFAULT_MAX_FREQUENCY);
  }

  /**
   * Creates a heart with an explicit maximum beat frequency.
   *
   * @param context the application context
   * @param maxBeatFrequencyMillis minimum number of milliseconds between two stored heartbeats
   * @return a new heart
   * @throws ReaperException if the {@link MetricsService} cannot be created
   */
  @VisibleForTesting
  static Heart create(AppContext context, long maxBeatFrequencyMillis) throws ReaperException {
    return new Heart(context, maxBeatFrequencyMillis);
  }

  /**
   * Saves a heartbeat in distributed storage, unless one was saved less than the maximum beat frequency ago.
   * Does nothing when the storage is not an {@link IDistributedStorage}.
   */
  synchronized void beat() {
    if (context.storage instanceof IDistributedStorage
        && lastBeat.get() + maxBeatFrequencyMillis < System.currentTimeMillis()) {

      lastBeat.set(System.currentTimeMillis());
      ((IDistributedStorage) context.storage).saveHeartbeat();
    }
  }

  /**
   * Triggers an asynchronous node metrics update when running on distributed storage in collocated mode.
   */
  synchronized void beatMetrics() {
    if (context.storage instanceof IDistributedStorage
            && context.config.getDatacenterAvailability().isInCollocatedMode()) {
      updateRequestedNodeMetrics();
    }
  }

  /**
   * Reports whether a node metrics update is currently in flight.
   *
   * @return a fresh {@link AtomicBoolean} holding a snapshot of the flag; mutating it has no effect on this heart
   */
  AtomicBoolean isCurrentlyUpdatingNodeMetrics() {
    return new AtomicBoolean(updatingNodeMetrics.get());
  }

  /**
   * Shuts the worker pool down, waiting up to 10 seconds for running work before forcing termination.
   */
  @Override
  public void close() {
    try {
      forkJoinPool.shutdown();
      forkJoinPool.awaitTermination(10, TimeUnit.SECONDS);
    } catch (InterruptedException e) {
      // Keep the interruption visible to the caller; the pool is still forced down below.
      Thread.currentThread().interrupt();
    } finally {
      forkJoinPool.shutdownNow();
    }
  }

  /**
   * Starts one asynchronous metrics update on the worker pool, unless one is already running.
   *
   * <p>Any failure inside the update is logged and never propagated; the in-flight flag is always cleared
   * when the update ends.
   */
  private void updateRequestedNodeMetrics() {
    Preconditions.checkArgument(context.storage instanceof IDistributedStorage);
    registerGauges();
    LOG.info("Updating heartbeat");

    if (!updatingNodeMetrics.getAndSet(true)) {
      forkJoinPool.submit(() -> {
        try (Timer.Context t0 = timer(context, "updatingNodeMetrics")) {

          // The stream work is submitted as its own task and awaited -- presumably so that the parallel
          // streams execute on this pool's workers rather than on the common pool.
          forkJoinPool.submit(() -> {
            answerNodeMetricsRequests();
          }).get();

          if (context.config.getDatacenterAvailability() == DatacenterAvailability.SIDECAR) {
            performSidecarDuties();
          }
        } catch (Exception ex) {
          LOG.warn("Failed metric collection during heartbeat", ex);
        } finally {
          assert updatingNodeMetrics.get();
          updatingNodeMetrics.set(false);
        }
      });
    }
  }

  /** Answers, in parallel, every answerable node metrics request of every locally known repair run. */
  private void answerNodeMetricsRequests() {
    context.repairManager.repairRunners.keySet()
        .parallelStream()
        .forEach(runId -> {

          ((IDistributedStorage) context.storage).getNodeMetrics(runId)
              .parallelStream()
              .filter(metrics -> canAnswerToNodeMetricsRequest(metrics))
              .forEach(req -> answerNodeMetricsRequest(runId, req));
        });
  }

  /** Answers a single node metrics request under a per-cluster/per-node timer; failures are only logged. */
  private void answerNodeMetricsRequest(UUID runId, NodeMetrics req) {
    LOG.info("Metric request");
    try (Timer.Context t1 = timer(
        context,
        req.getCluster().replace('.', '-'),
        req.getNode().replace('.', '-'))) {

      try {
        grabAndStoreNodeMetrics(context.storage, runId, req);

        LOG.info("Metric response");
      } catch (ReaperException | RuntimeException | InterruptedException ex) {
        LOG.debug("failed seed connection in cluster {}", req.getCluster(), ex);
      } catch (JMException e) {
        LOG.warn(
            "failed querying JMX MBean for metrics on node {} of cluster {} due to {}",
            req.getNode(), req.getCluster(), e.getMessage());
      }
    }
  }

  /**
   * Runs the work that is only done in {@code SIDECAR} mode. Exceptions other than those explicitly handled
   * propagate to the caller, which logs them.
   */
  private void performSidecarDuties() throws Exception {
    // In sidecar mode we store metrics in the db on a regular basis
    if (lastMetricBeat.get() + maxBeatFrequencyMillis <= System.currentTimeMillis()) {
      // NOTE: storing generic metrics (metricsService.grabAndStoreGenericMetrics) is currently disabled;
      // only the beat timestamp is advanced.
      lastMetricBeat.set(System.currentTimeMillis());
    } else {
      LOG.trace("Not storing metrics yet... Last beat was {} and now is {}",
          lastMetricBeat.get(),
          System.currentTimeMillis());
    }
    metricsService.grabAndStoreActiveCompactions();
    try {
      metricsService.grabAndStoreActiveStreams();
    } catch (Exception e) {
      // Best effort: a failure here must not prevent the remaining sidecar duties.
      LOG.debug("Failed grabbing and storing active streams", e);
    }

    handleRepairConflictFile();
    broadcastUpgradeIfDetected();
    cancelRepairsOnBroadcastTerminate();
  }

  /**
   * Acts on a non-empty {@link #REPAIR_CONFLICT_FILE}: broadcasts a terminate-all operation, cancels all
   * repairs, then truncates the file to mark the request as handled.
   */
  private void handleRepairConflictFile() throws Exception {
    File repairConflictFile = new File(REPAIR_CONFLICT_FILE);
    if (!repairConflictFile.exists() || repairConflictFile.length() <= 0) {
      return;
    }
    // I have seen instances where this file gets stuck and cannot be deleted.  So lets make sure that the
    // request is from the last hour before we act on it.
    if (!isRepairConflictRequestCurrent(repairConflictFile)) {
      return;
    }

    MetricsManager.getInstance().addStat(
        "storagegrid_private_repair_incremental_cancel_triggered{source=\"file\"}", 1, false);
    UUID lastUsed = SegmentRunner.lastUsedLeaderElectionId;
    if (lastUsed != null) {
      ((CassandraStorage) context.storage).takeLead(lastUsed);
      ((CassandraStorage) context.storage).renewLead(lastUsed);
    }
    // Lets throw the flag that says we have incremental repair conflict and everyone needs
    // to back out and terminate everything
    MetricsManager.getInstance().addStat(
        "storagegrid_private_repair_broadcast_terminate{source=\"file\"}", 1, false);
    LOG.info("Broadcasting request for everyone to terminate in Heart");
    ((IDistributedStorage) context.storage).storeOperations(MetricsService.BROADCAST_CLUSTER,
        OpType.OP_BROADCAST_TERMINATE_ALL,
        MetricsService.BROADCAST_HOST,
        OBJECT_MAPPER.writeValueAsString(System.currentTimeMillis()));
    JmxProxyImpl.getInstance().cancelAllRepairs();
    try {
      // write it back to zero byte saying we're done with it.  While this is goofy - cassandra writes the
      // file and reaper reads it and no matter what permissions I set - I can't get reaper to delete a file
      // made by cassandra.
      new FileWriter(repairConflictFile).close();
    } catch (Exception e) {
      LOG.warn("Unable to truncate repair conflict file {}", repairConflictFile, e);
    }
  }

  /**
   * Decides whether the request recorded in the repair-conflict file should be acted upon.
   *
   * @param repairConflictFile the flag file, whose first line is parsed as an epoch-millis timestamp
   * @return {@code false} only when the timestamp was read and is more than one hour old; {@code true}
   *     otherwise, including when the file cannot be read or parsed
   */
  private boolean isRepairConflictRequestCurrent(File repairConflictFile) {
    boolean shouldCancel = true;
    try (BufferedReader reader = new BufferedReader(new FileReader(repairConflictFile))) {
      long requestedAt = Long.parseLong(reader.readLine());
      long howLongAgo = System.currentTimeMillis() - requestedAt;
      LOG.info("See cancellation file containing age {}", howLongAgo);
      // I've seen permissions problems where the cancel file is undeletable by reaper.  Normally that
      // doesn't happen but I think there's a race condition with both reaper and miscd writing it at once.
      // As such - if it winds up in that state - this will unstick things after an hour
      if (howLongAgo > REPAIR_CONFLICT_MAX_AGE_MILLIS) {
        shouldCancel = false;
      }
    } catch (Exception e) {
      // Unreadable or unparsable content keeps the default: act on the request.
      LOG.debug("Unable to read request time from {}", repairConflictFile, e);
    }
    return shouldCancel;
  }

  /** Broadcasts {@code OP_UPGRADE_DETECTED} while either upgrade flag file exists. */
  private void broadcastUpgradeIfDetected() throws Exception {
    if (new File(UPGRADE_REQUESTED_FILE).exists() || new File(UPGRADE_IN_FLIGHT_FILE).exists()) {
      MetricsManager.getInstance().addStat("storagegrid_private_repair_upgrade_detected", 1, false);
      LOG.info("Noting in-progress upgrade to everyone");
      ((IDistributedStorage) context.storage).storeOperations(MetricsService.BROADCAST_CLUSTER,
          OpType.OP_UPGRADE_DETECTED,
          MetricsService.BROADCAST_HOST,
          OBJECT_MAPPER.writeValueAsString(System.currentTimeMillis()));
    }
  }

  /**
   * Cancels all repairs when storage holds a terminate-all broadcast newer than the last one acted upon.
   * A missing or unparsable stored value is ignored; failures while cancelling are logged, not propagated.
   */
  private void cancelRepairsOnBroadcastTerminate() throws Exception {
    String asStoredOps = ((IDistributedStorage) context.storage).listOperations(MetricsService.BROADCAST_CLUSTER,
        OpType.OP_BROADCAST_TERMINATE_ALL,
        MetricsService.BROADCAST_HOST);

    final long timeOfLastFlagThrow;
    try {
      timeOfLastFlagThrow = Long.parseLong(asStoredOps);
    } catch (NumberFormatException e) {
      // no problem: nothing (usable) has been broadcast
      LOG.debug("No parsable terminate-all broadcast time in storage: {}", asStoredOps);
      return;
    }

    if (myLastFlagDrivenTerminateAll < timeOfLastFlagThrow) {
      try {
        LOG.info("Received request for everyone to terminate");
        // someone wants us to terminate all since we last did it.
        MetricsManager.getInstance().addStat(
            "storagegrid_private_repair_incremental_cancel_triggered{source=\"broadcast\"}", 1, false);
        myLastFlagDrivenTerminateAll = timeOfLastFlagThrow;
        JmxProxyImpl.getInstance().cancelAllRepairs();
      } catch (Exception e) {
        LOG.warn("Failed acting on terminate-all broadcast", e);
      }
    }
  }

  /**
   * Checks if the local Reaper instance is supposed to answer a metrics request.
   *
   * <p>As implemented, the request is answered when either:
   * <ul>
   *   <li>the mode is {@code SIDECAR} and the request targets the local node address, or</li>
   *   <li>the mode is neither {@code ALL} nor {@code SIDECAR} and the record is flagged as requested.</li>
   * </ul>
   *
   * <p>NOTE(review): the requested flag is only consulted on the second branch ({@code &&} binds tighter than
   * {@code ||}). Earlier documentation suggested it should apply to both branches; confirm the intent before
   * changing this.
   *
   * @param metric a metric request
   * @return true if reaper should try to answer the metric request
   */
  private boolean canAnswerToNodeMetricsRequest(NodeMetrics metric) {
    return (context.config.getDatacenterAvailability() == DatacenterAvailability.SIDECAR
            && metric.getNode().equals(context.getLocalNodeAddress()))
        || ((context.config.getDatacenterAvailability() != DatacenterAvailability.ALL
            && context.config.getDatacenterAvailability() != DatacenterAvailability.SIDECAR)
            && metric.isRequested());
  }

  /**
   * Connects to the requested node and stores its pending compactions, repair-running state and the
   * value read from the progress file as the node metrics of the given run.
   *
   * @param storage the storage; must also be an {@link IDistributedStorage}
   * @param runId the repair run the metrics belong to
   * @param req the metrics request being answered
   */
  private void grabAndStoreNodeMetrics(IStorage storage, UUID runId, NodeMetrics req)
      throws ReaperException, InterruptedException, JMException {

    Cluster cluster = storage.getCluster(req.getCluster());
    JmxProxy nodeProxy = ClusterFacade.create(context).connect(cluster, Arrays.asList(req.getNode()));

    // this is a hack - but stuff acceptable progress metric into the unused activeanticompaction field
    int acceptableProgressTime = readAcceptableProgressTime();

    ((IDistributedStorage) storage).storeNodeMetrics(
        runId,
        NodeMetrics.builder()
            .withNode(req.getNode())
            .withCluster(req.getCluster())
            .withDatacenter(req.getDatacenter())
            .withPendingCompactions(nodeProxy.getPendingCompactions())
            .withHasRepairRunning(nodeProxy.isRepairRunning())
            .withActiveAnticompactions(acceptableProgressTime) // for future use
            .build());
  }

  /**
   * Reads the integer on the first line of the progress file.
   *
   * @return the parsed value, or 0 when the file is missing, unreadable or does not start with an integer
   */
  private static int readAcceptableProgressTime() {
    int acceptableProgressTime = 0;
    try {
      File progressFile = new File(REAPER_PROGRESS_FILE);
      if (progressFile.exists()) {
        try (BufferedReader reader = new BufferedReader(new FileReader(progressFile))) {
          acceptableProgressTime = Integer.parseInt(reader.readLine());
        }
      }
    } catch (Exception e) {
      // Best effort: fall back to 0 rather than failing the metrics response.
      LOG.debug("Unable to read acceptable progress time from {}", REAPER_PROGRESS_FILE, e);
    }
    return acceptableProgressTime;
  }

  /** Starts a timer named after this class and the given name parts; closing the context stops it. */
  private static Timer.Context timer(AppContext context, String... names) {
    return context.metricRegistry.timer(MetricRegistry.name(Heart.class, names)).time();
  }

  /**
   * Registers the worker pool gauges once per JVM (guarded by a static flag, so only the first heart that
   * gets here registers gauges, and they observe that heart's pool).
   */
  private void registerGauges() throws IllegalArgumentException {
    if (!GAUGES_REGISTERED.getAndSet(true)) {

      context.metricRegistry.register(
          MetricRegistry.name(Heart.class, "runningThreadCount"),
          (Gauge<Integer>) () -> forkJoinPool.getRunningThreadCount());

      context.metricRegistry.register(
          MetricRegistry.name(Heart.class, "activeThreadCount"),
          (Gauge<Integer>) () -> forkJoinPool.getActiveThreadCount());

      context.metricRegistry.register(
          MetricRegistry.name(Heart.class, "queuedTaskCount"),
          (Gauge<Long>) () -> forkJoinPool.getQueuedTaskCount());

      context.metricRegistry.register(
          MetricRegistry.name(Heart.class, "queuedSubmissionCount"),
          (Gauge<Integer>) () -> forkJoinPool.getQueuedSubmissionCount());
    }
  }
}


