#!/usr/bin/perl -w

=head1 NAME

stdmetrics.pl -- collect and report standard Unix metrics

=head1 SYNOPSIS

C<stdmetrics.pl [--all | tests ] [ --opcmon | --no-opcmon ] [ --email | --no-email ] [ --anomaly-detection | --no-anomaly-detection] [ --disk-space-low | --no-disk-space-low ] [ --future-prediction | --no-future-prediction ] [ --record-only ] [ --datafile .... ] [--probability-multiplier ...] >

=head1 OPTIONS

=head2 REPORT CHOICE OPTIONS

=over

=item C<--anomaly-detection>

For each metric recorded, look at the number of standard deviations
away from the mean and report on usual values.

=item C<--no-anomaly-detection>

Don't report on standard deviations away from the mean.

=item C<--disk-space-low>

Report when a filesystem is running close (a few standard deviations)
to running out of space.

=item C<--no-disk-space-low>

Don't report on filesystem space.

=item C<--future-prediction>

For metrics which are a I<capacity> (see below), predict when it
will be exhausted.

=item C<--no-future-prediction>

Do not do capacity exhaustian prediction.

=item C<--record-only>

Do no reporting. Collect data only.

=back

=head2 METRIC CHOICE OPTIONS

The following options select what metrics are collected.

=over

=item C<--memory>

Collect the output from C<vmstat>. Works on HP-UX and Linux.

=item C<--disk>

Collect the output from C<bdf> or C<df>. Works on HP-UX, Linux and MacOS.

=item C<--sar>

Collect the output from C<sar>. Works on HP-UX.

=item C<--netstat> or C<--network>

Collect the output from C<netstat -in>. Works on HP-UX, Linux and MacOS.

=item C<--all> 

Collect all metrics.

=back

=head2 REPORTING METHOD OPTIONS

=over

=item C<--opcmon>  

This is the default if F</opt/OV/bin/OpC/opcmon> exists.

=item C<--no-opcmon>

Do not report to OVO even though F</opt/OV/bin/OpC/opcmon> exists.

=item C<--email>

Report via email. This has not yet been written.

=item C<--no-email>

Do not report via email. Currently does nothing.

=back


=head2 OTHER OPTIONS

=over

=item C<--datafile> ...

Use a specific datafile to record observations into, and to look at 
for historical data.

=item C<--probability-multiplier> ...

System metrics are often not Gaussian. Rather than do some intelligent
statistical processing to fix this, here's a fudge multiplier. If you
are getting ten times as many messages as you think you should, set
this to 10.

=back




=head1 DESCRIPTION

B<stdmetrics.pl> collects some standard Unix performance metrics, stores
them, and generates messages based on how statistically unusual the
measured value was.

It uses F<observe> to store and to calculate probabilities.

=head2 COLLECTED METRICS

Depending on its command-line options B<stdmetrics.pl> collects:

=over

=item CPU load average (from uptime)

=item Memory usage (from vmstat)

=item Network interface usage (from netstat -in)

=item All sar outputs:

=over

=item cpu activity

=item buffer activity

=item tty activity

=item syscalls activity

=item swap activity

=item fileaccess activity

=item queue activity

=item table activity

=item ipc activity

=back

=item Disk I/O activity for all disks

=back


=head2 CAPACITY METRICS

A few metrics are also trended (i.e. reports are generated as to when
the resource will run out).  These are:

=over

=item Disk space

=item Virtual memory free

=item Real memory free

=back

=head2 TIME BUCKETS

The local time zone determines the time buckets. There are currently 5
defined time buckets:

=over

=item B<weekend> -- Saturdays and Sundays

=item B<dawn> -- Monday to Friday, 7:00am to 8:59:59am

=item B<dusk> -- Monday to Friday, 5:00pm to 6:59:59pm

=item B<business-hours> -- Monday to Friday, 9:00am - 4:59:59pm

=item B<overnight> -- all other times.

=back


=head1 FILES

Unless overridden with the C<--datafile> argument or an environment
variable OBSERVE_DATAFILE, B<stdmetrics.pl> records each metric in its
own collection of time-based buckets.

If run as C<root> (as it usually would be), these buckets begin with
F</var/.observed>. If run as another user they will be stored in files
called F<.observed.*> in the user's home directory.

=head1 EXAMPLE

Most of the time, B<stdmetrics.pl> would simply be run like this:

C<stdmetrics.pl --all>

=cut

use strict;
use English;
use File::Basename;
my $observe = File::Basename::dirname($0) . "/observe";

if (! -f $observe) { 
    $observe = "/usr/local/bin/observe";
    if (! -f $observe) { die "Cannot find observe"; }
}

my @free_capacity_measures = qw{ram_free virtual_mem_free};

my (@current_time) = localtime(time);
my $hour = $current_time[2];
my $weekday = $current_time[7];

my $workshift;

if ($weekday == 6 || $weekday == 0) { $workshift = "weekend"; } 
elsif ($hour >= 7 && $hour < 9) { $workshift = "dawn"; }
elsif ($hour >= 9 && $hour < 17) { $workshift = "business-hours"; }
elsif ($hour >= 17 && $hour < 19) { $workshift = "dusk"; }
else { $workshift = "overnight"; }


use Getopt::Long;
my $do_cpu = 0;
my $do_memory = 0;
my $do_disk = 0;
my $do_sar = 0;
my $do_netstat = 0;
my $do_all = 0;
my $report_via_opcmon = -x "/opt/OV/bin/OpC/opcmon";
my $report_via_email = !$report_via_opcmon;
my $anomaly_detection = 1;
my $future_prediction = 1;
my $disk_space_low_test = 1;
my $probability_multiplier = 1.0;
my $debug_opcmon = 0;

GetOptions("cpu" => \$do_cpu,
	   "memory" => \$do_memory,
	   "disk" => \$do_disk,
	   "sar" => \$do_sar,
	   "netstat|network" => \$do_netstat,
	   "all" => \$do_all,
	   "opcmon!" =>  \$report_via_opcmon,
	   "debug_opcmon|debug-opcmon!" => \$debug_opcmon,
	   "email!" => \$report_via_email,
	   "anomaly-detection!" => \$anomaly_detection,
	   "disk-space-low!" => \$disk_space_low_test,
	   "future-prediction!" => \$future_prediction,
	   "record-only" => sub { $anomaly_detection = 0; $disk_space_low_test = 0; $future_prediction = 0; },
	   "datafile=s" => \$ENV{OBSERVE_DATAFILE},
	   "probability-multiplier=s" => \$probability_multiplier
    );


my $original_environment = exists $ENV{OBSERVE_DATAFILE} ? $ENV{OBSERVE_DATAFILE} : undef;



sub reset_observations_datafile {
    my $metric_class = shift;
    my $subfile;
    if ($metric_class eq 'disk') { 
	my $metric = shift;
	$metric =~ s:/:.:g;
	$subfile = $metric;
    } elsif ($metric_class =~ /^device/) {
	my $metric = shift;
	$metric =~ s:/:.:g;
	$subfile = ".$workshift.$metric_class.$metric";
    } else { 
	$subfile = ".$workshift.$metric_class";
    }


    if (defined $original_environment) {
	$ENV{OBSERVE_DATAFILE} = $original_environment;
    } elsif ($> == 0 || -w "/var/.observed$subfile") {
	$ENV{OBSERVE_DATAFILE} = "/var/.observed$subfile";
    } else {
	$ENV{OBSERVE_DATAFILE} = $ENV{HOME}."/.observed$subfile";
    }
    #print STDERR "Storing in $ENV{OBSERVE_DATAFILE}\n";
}


sub stats {
    my $metric = shift;
    my %stats;
    $stats{"count"} = 0;
    $stats{"stddev"} = 0;
    open(STATS,"$observe stats $metric 2>/dev/null|") || die "$!";
    my $line;
    while ($line = <STATS>) {
	if ($line =~ /(.*)\(.*\) = ([\d.e-]+)$/i) {
	    $stats{$1} = $2;
	}
    }
    close(STATS);
    return \%stats;
}

sub days_until_no_capacity {
    my $metric = shift;
    open(PREDICTION,"$observe predict $metric=0 |") || die "$!";
    my $line;
    while ($line = <PREDICTION>) {
	if ($line =~ /That is ([\d.-]+) days from now/) {
	    close(PREDICTION);
	    return $1;
	}
    }
    close(PREDICTION);
    return undef;
}

sub probability {
    my $metric = shift;
    my $observation = shift;
    open(PROBABILITY,"$observe probability $metric=$observation |")
	|| die "$!";
    my $first = <PROBABILITY>;
    my $second = <PROBABILITY>;
    if ($second =~ /([\d.]+)% chance event/) {
	close (PROBABILITY);
	return $1 * $probability_multiplier;
    }
    close(PROBABILITY);
    return undef;
}


sub trend  {
    my $metric = shift;
    open(TREND,"$observe trend $metric |") || die "$!";
    my %slope;
    while (<TREND>) {
	next unless /is increasing at ([\d.-]+) units per (\w+)/;
	$slope{$2} = $1;
    }
    close(TREND);
    return \%slope;
}

sub opcmon {
    system("/opt/OV/bin/OpC/opcmon",@_) if $report_via_opcmon;
    print STDERR join(" ","/opt/OV/bin/OpC/opcmon",@_,"\n") if $debug_opcmon;
}

sub report {
    my $metric_class = shift;
    my $data = shift;
    my $depletion;
    my $metric;
    my $observation;
    my $gradients;
    my $maximum_absurdity = 5;
    # If there is enough uncertainty in the data (i.e. regress time
    # against storage gives different answers to storage against
    # time), then we can get silly messages like this:
    # /var has 2 days capacity! Currently at 90%, dropping 0.1% per day
    # $maximum_absurdity says don't bother sending trend data if
    # the disagreement is over this factor.
    my $trendable = 0;

    return unless $disk_space_low_test || $anomaly_detection || $future_prediction;

METRIC:
    foreach $metric (keys %$data) {
	&reset_observations_datafile($metric_class,$metric);
	my $observation = $data->{$metric};
	my $stats = &stats($metric);
	if ($stats->{"stddev"} == 0.0) { 
	    #print STDERR "Insufficient data for $metric =". ($stats->{"stddev"}). "...\n"; 
	    next METRIC; 
	}
	my @stats_opts =
	    ("-option","mean=".$stats->{"mean"},
	     "-option","stddev=".$stats->{"stddev"},
	     "-option","observation=$observation",
	     "-option","threestddevs=".3*$stats->{"stddev"});
	my @trend_opts;

	my $probability = &probability($metric,$observation);
	if ($future_prediction && 
	    ($metric_class eq 'disk' || 
	    grep($_ eq $metric,@free_capacity_measures))) {
	    $depletion = &days_until_no_capacity($metric);
	    $gradients = &trend($metric);
	    if ($depletion * $gradients->{"day"} * $maximum_absurdity >
		$observation) {
		$trendable = 1;
		@trend_opts = (
		    "-object",$metric,
		    "-option","observation=$observation",
		    "-option","daily=".(-$gradients->{"day"}),
		    "-option","weekly=".(-$gradients->{"week"})
		    );
	    } else {
		$trendable = 0;
	    }
	}

	if ($metric_class eq 'disk') {
	    # Are we low on space?
	    $probability = &probability($metric,3*$stats->{"stddev"});
	    opcmon("disk=$probability","-object",$metric,@stats_opts)
		if $disk_space_low_test;
	    opcmon("days_of_diskspace_capacity_left=$depletion",@trend_opts)
		if $trendable;
	} elsif ($metric_class =~ /^device/) {
	    opcmon("$metric_class=$probability","-object",$metric,@stats_opts)
		if $anomaly_detection;
	} elsif ($metric_class =~ /^net-(.*)/) {
	    opcmon("$metric=$probability","-object",$1,@stats_opts)
		if $anomaly_detection;
	} else {
	    opcmon("$metric=$probability",@stats_opts)
		if $anomaly_detection;
	    opcmon("days_of_${metric}_capacity_left=$depletion",@trend_opts)
		if $trendable;
	}
    }
}









######################################################################


sub disk_usage {
    my $rare = shift;
    my $tag = shift;
    my $df = "df -kl";
    if ($OSNAME eq 'hpux') {
	$df = "bdf -l";
    }
    open(DF,"$df|") || die "$!";
    my $firstline = <DF>;
    my %data;
    my ($volume,$kbytes,$used,$avail,$capacity,$mountpoint);
    while (<DF>) {
	chomp;
	my @fields = split;
	if ($#fields < 5) {
	    $_ .= <DF>;
	    chomp;
	    @fields = split;
	}
	$capacity = $fields[$#fields-1];
	$mountpoint = $fields[$#fields];
	$capacity =~ s/%//;
	my $capacity_free = 100 - $capacity;
	$data{$mountpoint} = $capacity_free;
	
	&reset_observations_datafile('disk',$mountpoint);
	system($observe,"record","$mountpoint=$capacity_free");
	push(@free_capacity_measures,$mountpoint);
    }
    &report('disk',\%data);
}


sub one_minute_load_average {
    open(UPTIME,"uptime|") || die "$!";
    my $line = <UPTIME>;
    my %data;
    &reset_observations_datafile('cpu');
    if ($line =~ /.*load averages?: ([\d.]+)[ ,]/) {
	system($observe,"record","load_average=$1");
	$data{"load_average"}= $1;
    }
    close(UPTIME);
    &report("cpu",\%data);
}

sub sar {
    my %sars = ('u' => 'sar-cpu', 'b' => 'sar-buffer',
		'y' => 'sar-tty', 'c' => 'sar-syscalls',
		'w' => 'sar-swap', 'a' => 'sar-fileaccess',
		'q' => 'sar-queue', 'v' => 'sar-tables',
		'm' => 'sar-ipc' );
    # Should also do disk activity. But it's a bit trickier.
    my $option;
    if ($OSNAME ne 'hpux') { return; } # will fix for linux later
    # This whole function should be rewritten using AsciiTableReader anyway.
    foreach $option (keys %sars) {
	&reset_observations_datafile($sars{$option});
	open(SAR,"sar -$option 1 1 |") || die "can't run sar -$option";
	my $last_line;
	my $next_to_last_line;
	while (<SAR>) {
	    $next_to_last_line = $last_line;
	    $last_line = $_;
	}
	chomp $next_to_last_line;
	chomp $last_line;
	close(SAR);
	my @args;
	my @last_line_fields = split(/ +/,$last_line);
	my @next_to_last_line_fields = split(/ +/,$next_to_last_line);
	my $i;
	my $field_name;
	my %data;

	# yes, skip over first field which just shows the time
      COLUMN:
	for($i=1;$i<=$#next_to_last_line_fields;$i++) {
	    next COLUMN unless $last_line_fields[$i] =~ /^[\d.]+$/;
	    $field_name = $next_to_last_line_fields[$i];
	    if ($field_name eq 'ov') { $field_name = $next_to_last_line_fields[$i-1]."-overflows";
	    }
	    $field_name =~ s:/s$:-per-sec:;
	    $field_name =~ s/^%(.*)/$1-percent/;
	    $data{$field_name} = $last_line_fields[$i];
	    #print STDERR "\$data{$field_name} = $last_line_fields[$i]\n";
	    push(@args,"$field_name=$last_line_fields[$i]");
	}
	system($observe,"record",@args);
	&report($sars{$option},\%data);
    }

    # Now, disks.
    my %data;
    open(SAR,"sar -d 1 1|") || die "Can't run sar -d to get disk stats";
    my $junk;
    $junk = <SAR>; return unless $junk =~ /^\s*$/; # blank line
    $junk = <SAR>; # OS header details
    $junk = <SAR>; return unless $junk =~ /^\s*$/; # another blank line
    $junk = <SAR>; return unless
        ($junk =~ /device/) && ($junk =~ /%busy/) && ($junk =~ /avwait/);
    #my $first_line = 1;
    my $line;
    my @fields;
    while($line = <SAR>) {
	chomp $line;
	@fields = split(/ +/,$line);
	#print STDERR ".. $line\n";
	shift(@fields);
	#if ($first_line) { shift(@fields); $first_line = 0; }
	#print STDERR "..".join("; ",@fields)."\n";
	my $device = $fields[0];


	my %titles = (1 => "device-busy-percent", 
		      2 => "device-average-queue",
		      3 => "device-reads-and-writes-per-sec",
		      4 => "device-blocks-per-sec",
		      5 => "device-average-wait-time",
		      6 => "device-average-service-time");
	my $column;
	foreach $column (keys %titles) {
	    my $title = $titles{$column};
            my %data;
	    $data{$device} = $fields[$column];
	    &reset_observations_datafile($title,$device);
	    system("$observe","record","$device=$data{$device}");
	    #print STDERR "$title: $device=$data{$device}\n";
	    &report($title,\%data);
	}
    }
}


sub memory {
    my $free = "free -tm";
    if ($OSNAME eq 'hpux') {
	$free = "swapinfo -tm";
    } elsif ($OSNAME eq 'darwin') { 
	return; 
	# for now, don't know how what to do with MacOS
    }
    open(FREE,"$free|") || die "$!";
    my $line = <FREE>;
FIND_MEM_LINE:
    while ($line !~ /^Mem/i) { 
	$line = <FREE>; 
	if ($line =~ /total/) {
	    # Ahh, must be a swapmem_off machine
	    last FIND_MEM_LINE;
	}
    }
    my %data;
    &reset_observations_datafile('memory');
    if ($line =~ /^mem[^\d](\d+)\s+(\d+)\s+(\d+)/) {
	$data{"physical_ram"} = $1;
	$data{"ram_used"} = $2;
	$data{"ram_free"} = $3;
	system($observe,"record","physical_ram=$data{physical_ram}",
	       "ram_used=$data{ram_used}","ram_free=$data{ram_free}");
    }
    while ($line !~ /^Total/i) { $line = <FREE>; }
    $line =~ /total:?\s+(\d+)\s+(\d+)\s+(\d+)/;
    $data{"virtual_mem_total"} = $1;
    $data{"virtual_mem_used"} = $2;
    $data{"virtual_mem_free"} = $3;
    system($observe,"record",
	   "virtual_mem_total=$data{virtual_mem_total}",
	   "virtual_mem_used=$data{virtual_mem_used}",
	   "virtual_mem_free=$data{virtual_mem_free}");
    &report("memory",\%data);
} 



sub netstat_structure {
    open(NETSTAT,"netstat -in|") || die "couldn't run netstat";
    my (@netstat_lines) = <NETSTAT>;
    close(NETSTAT);
    my $header;
    my @content_lines;
    my $row_leader_name;
    if ($OSNAME =~ /linux/i) {
	my $junk;
	($junk,$header,@content_lines) = @netstat_lines;
	@content_lines = grep ($_ !~ /no statistics available/,@content_lines);
	$row_leader_name = 'Iface';
    } elsif ($OSNAME =~ /darwin/i) {
	($header,@content_lines) = @netstat_lines;
	@content_lines = grep(/Link/,@content_lines);
	@content_lines = grep($_ !~ /^fw0/,@content_lines);
	$row_leader_name = 'Name';
    } elsif ($OSNAME =~ /hpux/i) {
	($header,@content_lines) = @netstat_lines;
	$row_leader_name = 'Name';
    }
    chomp($header);
    chomp(@content_lines);
    my $answer = AsciiTableReader::from_lines($header,$row_leader_name,
					      @content_lines);
    return $answer;
}

sub traffic_rate {
    my $first_netstat = &netstat_structure();
    sleep 1;
    my $second_netstat = &netstat_structure();
    my $interface;
    foreach $interface (keys %$first_netstat) {
	&reset_observations_datafile($interface);
	my %data;
	my @args;
	my $k;
	foreach $k (keys %{$first_netstat->{$interface}}) {
	    my $old = $first_netstat->{$interface}->{$k};
	    my $now = $second_netstat->{$interface}->{$k};
	    if (defined $old && defined $now && 
		$old =~ /^\d+$/ && $now =~ /^\d+$/) {
		my $difference = $now - $old;
		$data{$k} = $difference;
		push(@args,"$k=$difference");
	    }
	}
	system($observe,"record",@args);
	&report("net-$interface",\%data);
    }
}


package AsciiTableReader;

sub new {
    my $class = shift;
    my $self = {};
    my $header_line = shift;
    bless $self,$class;
    $self->calculate_headers_and_positions($header_line);
    return $self;
}

sub from_lines {
    my $header = shift;
    my $row_leader_name = shift;
    my $atr = new AsciiTableReader($header);
    my @raw_lines = map($atr->readline($_),@_);
    my @keyed_lines = map {$_->{$row_leader_name} => $_} @raw_lines;
    my %hash = @keyed_lines;
    return \%hash;
}

sub calculate_headers_and_positions {
    my $self = shift;
    my $header_line = shift;
    my @fields = split(/\s+/,$header_line);
    if ($fields[0] =~ /^\s*$/) { shift @fields; }
    my $word;
    my $i;
    $self->{"starts"} = {};
    $self->{"ends"} = {};
    $i = 0;
    foreach $word (@fields) {
	my $start = index($header_line,$word,$i);
	die "index($header_line,$word,$i) = $start" if $start == -1;
	my $end = $start -1 + length $word;
	$self->{"starts"}->{$word} = $start;
	$self->{"ends"}->{$word} = $end;
	$i = $self->{"ends"}->{$word} + 1;
    }
}

sub most_likely_header {
    my $self = shift;
    my $word_start = shift;
    my $word_end = shift;
    my $best_header = undef;
    my $best_score = undef;
    my $header;
    foreach $header (keys %{$self->{"starts"}}) {
	my $start = $self->{"starts"}->{$header};
	my $end = $self->{"ends"}->{$header};
	if ($start == $word_start) { return $header; }
	if ($end == $word_end) { return $header; }
	my $start_score = abs($start - $word_start);
	my $end_score = abs($end - $word_end);
	my $score = $start_score < $end_score ? $start_score : $end_score;
	if (!defined($best_score) || $score < $best_score) { 
	    $best_score = $score; 
	    $best_header = $header; 
	}
    }
    return $best_header;
}

sub readline {
    my $self = shift;
    my $line = shift;
    return undef if $line =~ /^\s*[-=_]*\s*$/;
    my @words = split(/\s+/,$line);
    if ($words[0] =~ /^\s*$/) { shift @words; }
    my $word;
    my $i = 0;
    my %answer;
  WORD:
    foreach $word (@words) {
	my $start = index($line,$word,$i);
	my $end = $start -1 + length $word;
	my $header = $self->most_likely_header($start,$end);
	#print STDERR "In line $line, word $word (start $start end $end) seemed best to be $header\n";
	$answer{$header} = $word;
	$i = $end + 1;
    }
    return \%answer;
}

package main;


my $arg;

&one_minute_load_average() if $do_cpu || $do_all;
&memory() if $do_netstat || $do_all;
&disk_usage() if $do_disk || $do_all;
&sar() if $do_sar || $do_all;
&traffic_rate() if $do_netstat || $do_all;
