#!/usr/bin/perl -w =head1 NAME stdmetrics.pl -- collect and report standard Unix metrics =head1 SYNOPSIS C =head1 OPTIONS =head2 REPORT CHOICE OPTIONS =over =item C<--anomaly-detection> For each metric recorded, look at the number of standard deviations away from the mean and report on usual values. =item C<--no-anomaly-detection> Don't report on standard deviations away from the mean. =item C<--disk-space-low> Report when a filesystem is running close (a few standard deviations) to running out of space. =item C<--no-disk-space-low> Don't report on filesystem space. =item C<--future-prediction> For metrics which are a I (see below), predict when it will be exhausted. =item C<--no-future-prediction> Do not do capacity exhaustian prediction. =item C<--record-only> Do no reporting. Collect data only. =back =head2 METRIC CHOICE OPTIONS The following options select what metrics are collected. =over =item C<--memory> Collect the output from C. Works on HP-UX and Linux. =item C<--disk> Collect the output from C or C. Works on HP-UX, Linux and MacOS. =item C<--sar> Collect the output from C. Works on HP-UX. =item C<--netstat> or C<--network> Collect the output from C. Works on HP-UX, Linux and MacOS. =item C<--all> Collect all metrics. =back =head2 REPORTING METHOD OPTIONS =over =item C<--opcmon> This is the default if F exists. =item C<--no-opcmon> Do not report to OVO even though F exists. =item C<--email> Report via email. This has not yet been written. =item C<--no-email> Do not report via email. Currently does nothing. =back =head2 OTHER OPTIONS =over =item C<--datafile> ... Use a specific datafile to record observations into, and to look at for historical data. =item C<--probability-multiplier> ... System metrics are often not Gaussian. Rather than do some intelligent statistical processing to fix this, here's a fudge multiplier. If you are getting ten times as many messages as you think you should, set this to 10. =back =head1 DESCRIPTION B collects some standard Unix performance metrics, stores them, and generates messages based on how statistically unusual the measured value was. It uses F to store and to calculate probabilities. =head2 COLLECTED METRICS Depending on its command-line options B collects: =over =item CPU load average (from uptime) =item Memory usage (from vmstat) =item Network interface usage (from netstat -in) =item All sar outputs: =over =item cpu activity =item buffer activity =item tty activity =item syscalls activity =item swap activity =item fileaccess activity =item queue activity =item table activity =item ipc activity =back =item Disk I/O activity for all disks =back =head2 CAPACITY METRICS A few metrics are also trended (i.e. reports are generated as to when the resource will run out). These are: =over =item Disk space =item Virtual memory free =item Real memory free =back =head2 TIME BUCKETS The local time zone determines the time buckets. There are currently 5 defined time buckets: =over =item B -- Saturdays and Sundays =item B -- Monday to Friday, 7:00am to 8:59:59am =item B -- Monday to Friday, 5:00pm to 6:59:59pm =item B -- Monday to Friday, 9:00am - 4:59:59pm =item B -- all other times. =back =head1 FILES Unless overridden with the C<--datafile> argument or an environment variable OBSERVE_DATAFILE, B records each metric in its own collection of time-based buckets. If run as C (as it usually would be), these buckets begin with F. If run as another user they will be stored in files called F<.observed.*> in the user's home directory. =head1 EXAMPLE Most of the time, B would simply be run like this: C =cut use strict; use English; use File::Basename; my $observe = File::Basename::dirname($0) . "/observe"; if (! -f $observe) { $observe = "/usr/local/bin/observe"; if (! -f $observe) { die "Cannot find observe"; } } my @free_capacity_measures = qw{ram_free virtual_mem_free}; my (@current_time) = localtime(time); my $hour = $current_time[2]; my $weekday = $current_time[7]; my $workshift; if ($weekday == 6 || $weekday == 0) { $workshift = "weekend"; } elsif ($hour >= 7 && $hour < 9) { $workshift = "dawn"; } elsif ($hour >= 9 && $hour < 17) { $workshift = "business-hours"; } elsif ($hour >= 17 && $hour < 19) { $workshift = "dusk"; } else { $workshift = "overnight"; } use Getopt::Long; my $do_cpu = 0; my $do_memory = 0; my $do_disk = 0; my $do_sar = 0; my $do_netstat = 0; my $do_all = 0; my $report_via_opcmon = -x "/opt/OV/bin/OpC/opcmon"; my $report_via_email = !$report_via_opcmon; my $anomaly_detection = 1; my $future_prediction = 1; my $disk_space_low_test = 1; my $probability_multiplier = 1.0; my $debug_opcmon = 0; GetOptions("cpu" => \$do_cpu, "memory" => \$do_memory, "disk" => \$do_disk, "sar" => \$do_sar, "netstat|network" => \$do_netstat, "all" => \$do_all, "opcmon!" => \$report_via_opcmon, "debug_opcmon|debug-opcmon!" => \$debug_opcmon, "email!" => \$report_via_email, "anomaly-detection!" => \$anomaly_detection, "disk-space-low!" => \$disk_space_low_test, "future-prediction!" => \$future_prediction, "record-only" => sub { $anomaly_detection = 0; $disk_space_low_test = 0; $future_prediction = 0; }, "datafile=s" => \$ENV{OBSERVE_DATAFILE}, "probability-multiplier=s" => \$probability_multiplier ); my $original_environment = exists $ENV{OBSERVE_DATAFILE} ? $ENV{OBSERVE_DATAFILE} : undef; sub reset_observations_datafile { my $metric_class = shift; my $subfile; if ($metric_class eq 'disk') { my $metric = shift; $metric =~ s:/:.:g; $subfile = $metric; } elsif ($metric_class =~ /^device/) { my $metric = shift; $metric =~ s:/:.:g; $subfile = ".$workshift.$metric_class.$metric"; } else { $subfile = ".$workshift.$metric_class"; } if (defined $original_environment) { $ENV{OBSERVE_DATAFILE} = $original_environment; } elsif ($> == 0 || -w "/var/.observed$subfile") { $ENV{OBSERVE_DATAFILE} = "/var/.observed$subfile"; } else { $ENV{OBSERVE_DATAFILE} = $ENV{HOME}."/.observed$subfile"; } #print STDERR "Storing in $ENV{OBSERVE_DATAFILE}\n"; } sub stats { my $metric = shift; my %stats; $stats{"count"} = 0; $stats{"stddev"} = 0; open(STATS,"$observe stats $metric 2>/dev/null|") || die "$!"; my $line; while ($line = ) { if ($line =~ /(.*)$.*$ = ([\d.e-]+)$/i) { $stats{$1} = $2; } } close(STATS); return \%stats; } sub days_until_no_capacity { my $metric = shift; open(PREDICTION,"$observe predict $metric=0 |") || die "$!"; my $line; while ($line = ) { if ($line =~ /That is ([\d.-]+) days from now/) { close(PREDICTION); return $1; } } close(PREDICTION); return undef; } sub probability { my $metric = shift; my $observation = shift; open(PROBABILITY,"$observe probability $metric=$observation |") || die "$!"; my $first = ; my $second = ; if ($second =~ /([\d.]+)% chance event/) { close (PROBABILITY); return $1 * $probability_multiplier; } close(PROBABILITY); return undef; } sub trend { my $metric = shift; open(TREND,"$observe trend $metric |") || die "$!"; my %slope; while () { next unless /is increasing at ([\d.-]+) units per (\w+)/; $slope{$2} = $1; } close(TREND); return \%slope; } sub opcmon { system("/opt/OV/bin/OpC/opcmon",@_) if $report_via_opcmon; print STDERR join(" ","/opt/OV/bin/OpC/opcmon",@_,"\n") if $debug_opcmon; } sub report { my $metric_class = shift; my $data = shift; my $depletion; my $metric; my $observation; my $gradients; my $maximum_absurdity = 5; # If there is enough uncertainty in the data (i.e. regress time # against storage gives different answers to storage against # time), then we can get silly messages like this: # /var has 2 days capacity! Currently at 90%, dropping 0.1% per day # $maximum_absurdity says don't bother sending trend data if # the disagreement is over this factor. my $trendable = 0; return unless $disk_space_low_test || $anomaly_detection || $future_prediction; METRIC: foreach $metric (keys %$data) { &reset_observations_datafile($metric_class,$metric); my $observation = $data->{$metric}; my $stats = &stats($metric); if ($stats->{"stddev"} == 0.0) { #print STDERR "Insufficient data for $metric =". ($stats->{"stddev"}). "...\n"; next METRIC; } my @stats_opts = ("-option","mean=".$stats->{"mean"}, "-option","stddev=".$stats->{"stddev"}, "-option","observation=$observation", "-option","threestddevs=".3*$stats->{"stddev"}); my @trend_opts; my $probability = &probability($metric,$observation); if ($future_prediction && ($metric_class eq 'disk' || grep($_ eq $metric,@free_capacity_measures))) { $depletion = &days_until_no_capacity($metric); $gradients = &trend($metric); if ($depletion * $gradients->{"day"} * $maximum_absurdity > $observation) { $trendable = 1; @trend_opts = ( "-object",$metric, "-option","observation=$observation", "-option","daily=".(-$gradients->{"day"}), "-option","weekly=".(-$gradients->{"week"}) ); } else { $trendable = 0; } } if ($metric_class eq 'disk') { # Are we low on space? $probability = &probability($metric,3*$stats->{"stddev"}); opcmon("disk=$probability","-object",$metric,@stats_opts) if $disk_space_low_test; opcmon("days_of_diskspace_capacity_left=$depletion",@trend_opts) if $trendable; } elsif ($metric_class =~ /^device/) { opcmon("$metric_class=$probability","-object",$metric,@stats_opts) if $anomaly_detection; } elsif ($metric_class =~ /^net-(.*)/) { opcmon("$metric=$probability","-object",$1,@stats_opts) if $anomaly_detection; } else { opcmon("$metric=$probability",@stats_opts) if $anomaly_detection; opcmon("days_of_${metric}_capacity_left=$depletion",@trend_opts) if $trendable; } } } ###################################################################### sub disk_usage { my $rare = shift; my $tag = shift; my $df = "df -kl"; if ($OSNAME eq 'hpux') { $df = "bdf -l"; } open(DF,"$df|") || die "$!"; my $firstline = ; my %data; my ($volume,$kbytes,$used,$avail,$capacity,$mountpoint); while () { chomp; my @fields = split; if ($#fields < 5) { $_ .= ; chomp; @fields = split; } $capacity = $fields[$#fields-1]; $mountpoint = $fields[$#fields]; $capacity =~ s/%//; my $capacity_free = 100 - $capacity; $data{$mountpoint} = $capacity_free; &reset_observations_datafile('disk',$mountpoint); system($observe,"record","$mountpoint=$capacity_free"); push(@free_capacity_measures,$mountpoint); } &report('disk',\%data); } sub one_minute_load_average { open(UPTIME,"uptime|") || die "$!"; my $line = ; my %data; &reset_observations_datafile('cpu'); if ($line =~ /.*load averages?: ([\d.]+)[ ,]/) { system($observe,"record","load_average=$1"); $data{"load_average"}= $1; } close(UPTIME); &report("cpu",\%data); } sub sar { my %sars = ('u' => 'sar-cpu', 'b' => 'sar-buffer', 'y' => 'sar-tty', 'c' => 'sar-syscalls', 'w' => 'sar-swap', 'a' => 'sar-fileaccess', 'q' => 'sar-queue', 'v' => 'sar-tables', 'm' => 'sar-ipc' ); # Should also do disk activity. But it's a bit trickier. my $option; if ($OSNAME ne 'hpux') { return; } # will fix for linux later # This whole function should be rewritten using AsciiTableReader anyway. foreach $option (keys %sars) { &reset_observations_datafile($sars{$option}); open(SAR,"sar -$option 1 1 |") || die "can't run sar -$option"; my $last_line; my $next_to_last_line; while () { $next_to_last_line = $last_line; $last_line = $_; } chomp $next_to_last_line; chomp $last_line; close(SAR); my @args; my @last_line_fields = split(/ +/,$last_line); my @next_to_last_line_fields = split(/ +/,$next_to_last_line); my $i; my $field_name; my %data; # yes, skip over first field which just shows the time COLUMN: for($i=1;$i<=$#next_to_last_line_fields;$i++) { next COLUMN unless $last_line_fields[$i] =~ /^[\d.]+$/; $field_name = $next_to_last_line_fields[$i]; if ($field_name eq 'ov') { $field_name = $next_to_last_line_fields[$i-1]."-overflows"; } $field_name =~ s:/s$:-per-sec:; $field_name =~ s/^%(.*)/$1-percent/; $data{$field_name} = $last_line_fields[$i]; #print STDERR "\$data{$field_name} = $last_line_fields[$i]\n"; push(@args,"$field_name=$last_line_fields[$i]"); } system($observe,"record",@args); &report($sars{$option},\%data); } # Now, disks. my %data; open(SAR,"sar -d 1 1|") || die "Can't run sar -d to get disk stats"; my $junk; $junk = ; return unless $junk =~ /^\s*$/; # blank line $junk = ; # OS header details $junk = ; return unless $junk =~ /^\s*$/; # another blank line $junk = ; return unless ($junk =~ /device/) && ($junk =~ /%busy/) && ($junk =~ /avwait/); #my $first_line = 1; my $line; my @fields; while($line = ) { chomp $line; @fields = split(/ +/,$line); #print STDERR ".. $line\n"; shift(@fields); #if ($first_line) { shift(@fields); $first_line = 0; } #print STDERR "..".join("; ",@fields)."\n"; my $device = $fields[0]; my %titles = (1 => "device-busy-percent", 2 => "device-average-queue", 3 => "device-reads-and-writes-per-sec", 4 => "device-blocks-per-sec", 5 => "device-average-wait-time", 6 => "device-average-service-time"); my $column; foreach $column (keys %titles) { my $title = $titles{$column}; my %data; $data{$device} = $fields[$column]; &reset_observations_datafile($title,$device); system("$observe","record","$device=$data{$device}"); #print STDERR "$title: $device=$data{$device}\n"; &report($title,\%data); } } } sub memory { my $free = "free -tm"; if ($OSNAME eq 'hpux') { $free = "swapinfo -tm"; } elsif ($OSNAME eq 'darwin') { return; # for now, don't know how what to do with MacOS } open(FREE,"$free|") || die "$!"; my $line = ; FIND_MEM_LINE: while ($line !~ /^Mem/i) { $line = ; if ($line =~ /total/) { # Ahh, must be a swapmem_off machine last FIND_MEM_LINE; } } my %data; &reset_observations_datafile('memory'); if ($line =~ /^mem[^\d](\d+)\s+(\d+)\s+(\d+)/) { $data{"physical_ram"} = $1; $data{"ram_used"} = $2; $data{"ram_free"} = $3; system($observe,"record","physical_ram=$data{physical_ram}", "ram_used=$data{ram_used}","ram_free=$data{ram_free}"); } while ($line !~ /^Total/i) { $line = ; } $line =~ /total:?\s+(\d+)\s+(\d+)\s+(\d+)/; $data{"virtual_mem_total"} = $1; $data{"virtual_mem_used"} = $2; $data{"virtual_mem_free"} = $3; system($observe,"record", "virtual_mem_total=$data{virtual_mem_total}", "virtual_mem_used=$data{virtual_mem_used}", "virtual_mem_free=$data{virtual_mem_free}"); &report("memory",\%data); } sub netstat_structure { open(NETSTAT,"netstat -in|") || die "couldn't run netstat"; my (@netstat_lines) = ; close(NETSTAT); my $header; my @content_lines; my $row_leader_name; if ($OSNAME =~ /linux/i) { my $junk; ($junk,$header,@content_lines) = @netstat_lines; @content_lines = grep ($_ !~ /no statistics available/,@content_lines); $row_leader_name = 'Iface'; } elsif ($OSNAME =~ /darwin/i) { ($header,@content_lines) = @netstat_lines; @content_lines = grep(/Link/,@content_lines); @content_lines = grep($_ !~ /^fw0/,@content_lines); $row_leader_name = 'Name'; } elsif ($OSNAME =~ /hpux/i) { ($header,@content_lines) = @netstat_lines; $row_leader_name = 'Name'; } chomp($header); chomp(@content_lines); my $answer = AsciiTableReader::from_lines($header,$row_leader_name, @content_lines); return $answer; } sub traffic_rate { my $first_netstat = &netstat_structure(); sleep 1; my $second_netstat = &netstat_structure(); my $interface; foreach $interface (keys %$first_netstat) { &reset_observations_datafile($interface); my %data; my @args; my $k; foreach $k (keys %{$first_netstat->{$interface}}) { my $old = $first_netstat->{$interface}->{$k}; my $now = $second_netstat->{$interface}->{$k}; if (defined $old && defined $now && $old =~ /^\d+$/ && $now =~ /^\d+$/) { my $difference = $now - $old; $data{$k} = $difference; push(@args,"$k=$difference"); } } system($observe,"record",@args); &report("net-$interface",\%data); } } package AsciiTableReader; sub new { my $class = shift; my $self = {}; my $header_line = shift; bless $self,$class; $self->calculate_headers_and_positions($header_line); return $self; } sub from_lines { my $header = shift; my $row_leader_name = shift; my $atr = new AsciiTableReader($header); my @raw_lines = map($atr->readline($_),@_); my @keyed_lines = map {$_->{$row_leader_name} => $_} @raw_lines; my %hash = @keyed_lines; return \%hash; } sub calculate_headers_and_positions { my $self = shift; my $header_line = shift; my @fields = split(/\s+/,$header_line); if ($fields[0] =~ /^\s*$/) { shift @fields; } my $word; my $i; $self->{"starts"} = {}; $self->{"ends"} = {}; $i = 0; foreach $word (@fields) { my $start = index($header_line,$word,$i); die "index($header_line,$word,$i) = $start" if $start == -1; my $end = $start -1 + length $word; $self->{"starts"}->{$word} = $start; $self->{"ends"}->{$word} = $end; $i = $self->{"ends"}->{$word} + 1; } } sub most_likely_header { my $self = shift; my $word_start = shift; my $word_end = shift; my $best_header = undef; my $best_score = undef; my $header; foreach $header (keys %{$self->{"starts"}}) { my $start = $self->{"starts"}->{$header}; my $end = $self->{"ends"}->{$header}; if ($start == $word_start) { return $header; } if ($end == $word_end) { return $header; } my $start_score = abs($start - $word_start); my $end_score = abs($end - $word_end); my $score = $start_score < $end_score ? $start_score : $end_score; if (!defined($best_score) || $score < $best_score) { $best_score = $score; $best_header = $header; } } return $best_header; } sub readline { my $self = shift; my $line = shift; return undef if $line =~ /^\s*[-=_]*\s*$/; my @words = split(/\s+/,$line); if ($words[0] =~ /^\s*$/) { shift @words; } my $word; my $i = 0; my %answer; WORD: foreach $word (@words) { my $start = index($line,$word,$i); my $end = $start -1 + length $word; my $header = $self->most_likely_header($start,$end); #print STDERR "In line $line, word $word (start $start end $end) seemed best to be $header\n"; $answer{$header} = $word; $i = $end + 1; } return \%answer; } package main; my $arg; &one_minute_load_average() if $do_cpu || $do_all; &memory() if $do_netstat || $do_all; &disk_usage() if $do_disk || $do_all; &sar() if $do_sar || $do_all; &traffic_rate() if $do_netstat || $do_all;