#!/usr/bin/env perl

# Copyright 2019 Imply Data, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

use strict;
use warnings;
use Cwd qw/realpath/;
use POSIX qw/:sys_wait_h mkfifo setsid/;
use Fcntl qw/:DEFAULT :flock/;
use Getopt::Long qw/:config require_order gnu_compat/;
use FindBin;
use File::Spec;
use File::Copy;

sub logdie($)
{
  my ($msg) = @_;
  chomp $msg;
  die "[" . (scalar localtime()) . "] $msg\n";
}

sub logit($)
{
  my ($msg) = @_;
  chomp $msg;
  warn "[" . (scalar localtime()) . "] $msg\n";
}

sub usage
{
  die "usage: $0 -c <conf file> [-d <var dir>] [-t <kill timeout>] [--daemonize] [--svlogd-conf <conf file>]\n";
}

sub daemonize
{
  my ($svdir) = @_;
  my $logfile = "$svdir/supervise.log";
  open STDIN, '/dev/null' or die "Can't read /dev/null: $!\n";

  if (-e "$logfile") {
    rename "$logfile", "$logfile.0";
  }

  open STDOUT, '>', "$logfile" or die "Can't write to $logfile: $!\n";
  open STDERR, '>', "$logfile" or die "Can't write to $logfile: $!\n";
  defined (my $pid = fork) or die "Can't fork: $!\n";
  exit if $pid;
  setsid;
}

sub read_config_file
{
  my ($config_file) = @_;

  open my $config_fh, "<", $config_file
    or die "open $config_file: $!";

  my @commands;
  my @verify;
  my $kill_timeout;
  while (my $line = <$config_fh>) {
    chomp $line;
    next if $line =~ /^(\s*\#.*|\s*)$/;

    if ($line =~ /^(:verify|:kill-timeout|(?:\!p[0-9]+\s+)?[^:]\S+)\s+(.+)$/) {
      my $name = $1;
      my $order = 50;
      my $command = $2;

      if ($name =~ /^(?:\!p([0-9]+)\s+)(.*)$/) {
        $order = $1;
        $name = $2;
      }

      if ($name eq ':verify') {
        push @verify, $command;
      } elsif ($name eq ':kill-timeout') {
        $kill_timeout = int($command);
      } else {
        die "Duplicate command: $line\n" if grep { $_->{name} eq $name } @commands;
        push @commands, {
          name => $name,
          command => $command,
          order => $order,  # Stop order for this command
          pid => 0,         # Current pid, or 0 if not running
          down => 0,        # Time the proc should be down until
          killed => 0,      # Signal we sent to this process
          restarting => 0,  # True if this command is currently restarting
        };
      }
    } else {
      die "Syntax error: $line\n";
    }
  }

  close $config_fh;
  return { commands => \@commands, verify => \@verify, 'kill-timeout' => $kill_timeout };
}

sub stringify_exit_status
{
  my ($status) = @_;
  my $string;
  my $signal = $status & 127;
  my $cored = $status & 128;
  my $code = $status >> 8;

  if ($signal) {
    $string = "signal = $signal";
  } else {
    $string = "exited = $code";
  }

  if ($cored) {
    $string = $string . ", dumped core";
  }

  return $string;
}

sub open_control_fifo
{
  my ($svdir) = @_;
  my $fifofile = "$svdir/.ctrl";
  if (-e $fifofile) {
    unlink $fifofile or die "Cannot remove fifo: $fifofile\n";
  }
  mkfifo($fifofile, 0700) or die "Cannot create fifo: $fifofile\n";
  sysopen my $fifofh, $fifofile, O_NONBLOCK | O_RDWR or die "Cannot open fifo for reading: $fifofile\n";
  return $fifofh;
}

sub pretty
{
  my ($text, $color) = @_;
  if (-t STDERR) {
    if ($color eq 'bold') {
      return "\x1b[1m$text\x1b[0m";
    } elsif ($color eq 'red') {
      return "\x1b[31m\x1b[1m$text\x1b[0m";
    } else {
      return $text;
    }
  } else {
    return $text;
  }
}

my @commands;

# If nonzero we should be exiting. -1 means exit without signal, >0 means exit with signal
my $killed = 0;

# If >0 then kill -9 all procs at this time
my $killkill = 0;

# Current proc order we're stopping. Ignored unless $killed is nonzero
my $stopping = 100;

# We'll do our own reaping
$SIG{CHLD} = sub {};

# Redirect stderr to stdout
open STDERR, ">&STDOUT" or die;

# Parse arguments
my %opt = (
  'chdir' => realpath("$FindBin::Bin/.."),
  'vardir' => realpath("$FindBin::Bin/../var"),
  'kill-timeout' => 360,
);

# The svlogd-conf string is optional, for compatibility with older syntax that used --svlogd to enable it
usage() unless GetOptions(
  \%opt,
  'conf|c=s',
  'vardir|d=s',
  'kill-timeout|t=i',
  'chdir=s',
  'svlogd-conf:s',
  'daemonize'
);

usage() unless $opt{'conf'} && $opt{'vardir'};

# Read config file
my $config = read_config_file($opt{'conf'});
@commands = @{$config->{commands}};

if (!@commands) {
  die "Nothing to run.\n";
}

# Potentially override --kill-timeout
if (defined $config->{'kill-timeout'}) {
  $opt{'kill-timeout'} = $config->{'kill-timeout'};
}

# Remember where vardir, svdir are after chdiring
my $vardir = File::Spec->rel2abs($opt{vardir});
my $svdir = "$vardir/sv";

# chdir to the root of the distribution (or whereever)
chdir($opt{chdir}) or die "chdir[$opt{chdir}] failed: $!\n";

# Clear umask
umask 0;

# Create vardir with tmp/
if (! -e "$vardir/tmp") {
  system("mkdir -p \Q$vardir\E/tmp") == 0 or die "mkdir $vardir/tmp failed: $!\n";
}

# Create svdir
if (! -e $svdir) {
  system("mkdir -p \Q$svdir\E") == 0 or die "mkdir $svdir failed: $!\n";
}

# Lock svdir and keep it locked until we exit
my $lockfile = "$svdir/.lock";
open my $lockfh, ">", $lockfile or die "Cannot write to svdir, please check permissions: $svdir\n";
flock($lockfh, LOCK_EX | LOCK_NB) or die "Cannot lock svdir, maybe another 'supervise' is running: $svdir\n";

# Create control fifo in svdir
my $fifofh = open_control_fifo($svdir);

# Run verification commands
for my $verify_cmd (@{$config->{verify}}) {
  system($verify_cmd) == 0 or exit 1;
}

if ($opt{daemonize}) {
  daemonize($svdir);
}

# Catch killy signals and do an orderly shutdown
$SIG{HUP} = sub { if (!$killed) { $killed = 1; $killkill = time + $opt{'kill-timeout'}; } };
$SIG{INT} = sub { if (!$killed) { $killed = 2; $killkill = time + $opt{'kill-timeout'}; } };
$SIG{TERM} = sub { if (!$killed) { $killed = 15; $killkill = time + $opt{'kill-timeout'}; } };

# Build up control fifo command over multiple sysreads, potentially
my $fifobuffer = '';

while (1) {
  # Spawn new procs
  if (!$killed) {
    for my $command (grep { !$_->{pid} } @commands) {
      if ($command->{down} < time) {
        my $logdir = "$svdir/$command->{name}";

        logit "Running command[" . pretty($command->{name}, 'bold') . "], logging to[$logdir]: $command->{command}";

        if (my $pid = fork) {
          $command->{pid} = $pid;
          $command->{logdir} = $logdir;
        } else {
          setsid;

          if (! -e $logdir) {
            system("mkdir -p \Q$logdir\E") == 0 or logdie "mkdir $logdir failed: $!\n";
          }

          if ($opt{'svlogd-conf'}) {
            copy($opt{'svlogd-conf'}, "$logdir/config") or logdie "Failed copying $opt{'svlogd-conf'} to $logdir/config: $!";
          } else {
            open my $configfh, ">", "$logdir/config" or logdie "Cannot write svlogd config, please check permissions: $logdir/config\n";
            print $configfh "s100000000\nn10\nN5\nt604800";
            close $configfh or logdie "Cannot save svglod config: $!: $logdir/config\n";
          }

          exec('bin/logger', $logdir, $command->{command}) or logdie "exec [$command->{command}] failed: $!";
        }
      }
    }
  }

  # Reap dead procs
  my $pid;
  while (($pid = waitpid(-1, WNOHANG)) > 0) {
    my $status = $?;
    my ($command) = (grep { $_->{pid} eq $pid } @commands);
    if ($command) {
      $command->{pid} = 0;
      $command->{down} = time + 2;
      logit "Command[" . pretty($command->{name}, 'bold') . "] exited (pid = $pid, " . stringify_exit_status($status) . ")";
      if ($status && !$killed && !$command->{restarting}) {
        # Unexpected exit
        logit "Command[" . pretty($command->{name}, 'bold') . "] " . pretty("failed", "red") . ", see logs for more details: $command->{logdir}/current";
      }
      $command->{restarting} = 0;
    } else {
      logit "ERR: Reaped unknown command (pid = $pid, " . stringify_exit_status($status) . ")";
    }
  }

  # Kill procs, maybe
  if ($killed) {
    my $should_killkill = time > $killkill;

    # Update stopping position, maybe
    if ($should_killkill) {
      $stopping = 0;
    } else {
      my $maxorder = 0;
      for my $command (grep { $_->{pid} } @commands) {
        if ($command->{order} > $maxorder) {
          $maxorder = $command->{order};
        }
      }

      if ($maxorder < $stopping) {
        $stopping = $maxorder;
      }
    }

    for my $command (grep { $_->{pid} && $_->{order} >= $stopping } @commands) {
      my $want_signal;
      if ($command->{killed} == 9 || $should_killkill) {
        $want_signal = 9;
      } else {
        $want_signal = 15;
      }

      if ($command->{killed} != $want_signal) {
        if ($want_signal != 9) {
          my $kt = $opt{'kill-timeout'};
          logit "Sending signal[$want_signal] to command[" . pretty($command->{name}, 'bold') . "] (timeout ${kt}s).";
        } else {
          logit "Sending signal[$want_signal] to command[" . pretty($command->{name}, 'bold') . "].";
        }
        kill $want_signal, $command->{pid} or logit "WARN: Could not signal pid: $command->{pid}";
        $command->{killed} = $want_signal;
      }
    }
  }

  # Kill ourselves, maybe
  if ($killed && ! grep { $_->{pid} } @commands) {
    logit "Exiting.";
    $SIG{HUP} = $SIG{INT} = $SIG{TERM} = 'DEFAULT';
    if ($killed > 0) {
      kill $killed, $$;
      exit 1;
    } else {
      # Normal exit
      exit 0;
    }
  }

  # Be controlled, maybe
  my $fifostr = "";
  if (sysread $fifofh, $fifostr, 4096) {
    $fifobuffer .= $fifostr;

    while ($fifobuffer =~ /^([^\n]*)\n(.*)/s) {
      my $fifocmd = $1;
      $fifobuffer = $2;
      if ($fifocmd =~ /^k (.+)$/ && !$killed) {
        my $name = $1;
        my ($command) = grep { $_->{name} eq $name && $_->{pid} } @commands;
        if ($command) {
          logit "Restarting command[" . pretty($name, "bold") . "].";
          if (kill TERM => $command->{pid}) {
            $command->{restarting} = 1;
          } else {
            logit "WARN: Could not signal pid: $command->{pid}"
          }
        } else {
          logit "Asked to restart unknown command[" . pretty($name, "bold") . "], ignoring.";
        }
      } elsif ($fifocmd eq 'd') {
        # -1 means exit without signal
        $killed = -1;
        $killkill = time + $opt{'kill-timeout'}
      } else {
        logit "Received unknown control command, ignoring.";
      }
    }
  }

  sleep 1;
}

exit 0;
