Modified cache-summary.pl for squid 1.0 (new log format)

From: Stewart Forster <slf@dont-contact.us>
Date: Tue, 29 Oct 1996 09:24:33 +1100

Hi,

        We had troubles with cache-summary.pl taking HOURS to get through
processing the results. I did some digging and found it was spending almost
ALL of its time in doing the sort()'s for the top N stuff. It was taking over
an hour each for the top UDP and top TCP sorts. I've made some mods which
brought this down to about 5 seconds (for us anyway) for each top N sort
(seriously!). I don't know what PERL uses for a sort(), but I just implemented
a simple insertion sort into an array of N, and processing time for sorting
dropped by a factor of 1000!

        Also, the 1.0 cache-summary.pl doesn't count UDP_HIT_OBJ and
TCP_IFMODSINCE as savings, and doesn't count TCP_REFRESH as misses, so I've
added these to the sums. We found that our cache savings figures jumped by
13% (from 45% to 58%) when we started counting what all the savings we were
REALLY getting.

        Anyway, the script follows. I hope you find it useful, and maybe
the squid programmers can incorporate the sorting speed increases at least
into the newer versions of squid (if they haven't done so already).

        Cheers,

                Stew.

----------------------- cut here ----------------------
#!/opt/local/bin/perl
# cache-summary.pl with changes: Stewart Forster 28/10/96
$ENV{'TZ'} = 'Australia/NSW';
require 'timelocal.pl';
require 'getopts.pl';
$usage = "$0: -o -w window -n # -l logfile\n";

%Month = ('Jan', 0, 'Feb', 1, 'Mar', 2, 'Apr', 3, 'May', 4, 'Jun', 5,
          'Jul', 6, 'Aug', 7, 'Sep', 8, 'Oct', 9, 'Nov', 10, 'Dec', 11);

# Defaults
$logfile = '/usr/local/www/cache/access.log';
$window = 1; # days;
$Nlimit = 10; # show top 10 of things

&Getopts('l:n:w:o');
$window = $opt_w if defined $opt_w;
$Nlimit = $opt_n if defined $opt_n;
$logfile = $opt_l if defined $opt_l;

open(logfile) || die("$logfile: $!\n");

$curtime = time; # get current time stamp
$timhrzn = $window * 86400; # 1 day time horizon (86400 seconds)

$hlimit=$Nlimit; # top n hosts
$dhitlimit=$Nlimit; # top n document hits

chop( $hostname = `uname -n` );

$timcut = $curtime - $timhrzn; # time cutoff

while(<logfile>) {
        ($time,$servicet,$who,$flags,$size,$op,$what) = split;
        ($flag,$stat, $action)= split(/\//,$flags);
        @T = localtime($time);
        $min = $T[1];
        $hour = $T[2];

        if($time >= $timcut) {
         $bin = int($time/300); # 5 min. granularity
         $sbin = int($servicet/100); # 100 msec granularity
         $sbin= 500 if ($servicet > 50000);
               $SHist{$sbin}++;
         $flagcount{$flag}++;
         $oldtime = $time unless ($oldtime);
         $user{$who}++;
         if(($flag eq 'TCP_HIT') || ($flag eq 'TCP_MISS') || ($flag eq
'TCP_IFMODSINCE') || ($flag eq 'UDP_HIT_OBJ') || ($flag eq 'TCP_REFRESH')){
          $TODbytes{$bin}+=$size;
          $TODbytes{$bin}+=$size if ($flag eq 'TCP_MISS');
          $TODbytes{$bin}+=$size if ($flag eq 'TCP_REFRESH');
          $TODbytes{$bin}+=$size if ($flag eq 'TCP_IFMODSINCE' && $action ne
'NONE');
          $totalsize += $size;
         }

         next if ($flag eq 'TCP_DONE');

         $tcp_hit{$what}++ if ($flag eq 'TCP_HIT');
         $udp_hit{$what}++ if ($flag eq 'UDP_HIT');
         $hitsize += $size if ($flag eq 'TCP_HIT');
         $hitsize += $size if ($flag eq 'UDP_HIT_OBJ');
         $hitsize += $size if ($flag eq 'TCP_IFMODSINCE' && $action eq 'NONE');
         $modsize += $size if ($flag eq 'TCP_IFMODSINCE');
         $staleno++ if ($flag eq 'TCP_IFMODSINCE' && $size > 250);
         $stalesize += $size if ($flag eq 'TCP_IFMODSINCE' && $size > 250);

         next unless ($flag =~ /(TCP_HIT|TCP_MISS|TCP_MISS_TTL|UDP_HIT_OBJ|TCP_
IFMODSINCE|TCP_REFRESH)/);

         if ($what =~ m'([^:]+)://([^:/]+)') {
          $scheme = $1;
          $host = $2;
          $host =~ tr/A-Z/a-z/;
          $rsite{$host}++;
         }

         $not_cachable++ if &not_cachable($what);
         $url_count++;

         if ($what =~ m'/$') {
          $type{'Directory'}++;
         } else {
          @F = split ('/', $what);
          $F = pop @F;
          @F = split ('\.', $F);
          $F = pop @F;
          $F =~ tr/A-Z/a-z/;
          $type{$F}++;
         }
         $TOD{$bin}++;
        }
}
close(infile);

# Summaries for host/time and timings
($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdat) = localtime($curtime);
printf "Subject: cache stats report for $hostname at $curtime -- %s/%s/%s
%.2d:%.2d:%.2d\n\n",
       $year,$mon+1,$mday, $hour,$min,$sec;
printf"HT $hostname at $curtime -- %s/%s/%s %.2d:%.2d:%.2d\n",
       $year,$mon+1,$mday, $hour,$min,$sec;
printf"TI $timhrzn : %d sec. ( %.1f days ) ago\n",
       $time-$oldtime,($time-$oldtime)/86400;

# Action counts
foreach $key ( sort flagsort ( keys %flagcount)) {
        printf "AC\t%d\t%s\n",
                $flagcount{$key},
                $key;
}

# Performance summary
printf "BS %7.3f MBytes served through the cache\n", $totalsize/1e6;
printf "BC %7.3f MBytes served from cached objects\n", $hitsize/1e6;
printf "SV %7.3f percent Internet traffic savings\n",100*$hitsize/$totalsize;
printf "US %7.3f percent in URL stop-list\n",100*$not_cachable/$url_count;
printf "GM %7.3f kBytes in GIM requests\n",$modsize/1e3;
printf "GM %7.3f stale objects\n",$staleno;
printf "GM %7.3f KBytes in stale objects\n",$stalesize/1e3;

# Cache accesses
printf "AL top $hlimit hosts using the cache\n";
@user_top_count=();
@user_top_host=();
foreach $key (keys %user)
{
        next if($user{$key} <= $user_top_count[$hlimit]);
        for($pos = $hlimit; $pos > 0; $pos--)
        {
                last if($user{$key} <= $user_top_count[$pos]);
                $user_top_count[$pos + 1] = $user_top_count[$pos];
                $user_top_host[$pos + 1] = $user_top_host[$pos];
                $user_top_count[$pos] = $user{$key};
                $user_top_host[$pos] = $key;
        }
}
for($pos = 1; $pos <= $hlimit; $pos++)
{
        ($a1,$a2,$a3,$a4) = split('\.',$user_top_host[$pos]);
        $name = gethostbyaddr(pack('C4',$a1,$a2,$a3,$a4),2);
        printf "HS %7d accesses from %15s %s\n", $user_top_count[$pos],
                $user_top_host[$pos], $name;
}

printf "TL top $dhitlimit TCP_HIT documents:\n";
@tcp_hit_top_count=();
@tcp_hit_top_doc=();
foreach $key (keys %tcp_hit)
{
        next if($tcp_hit{$key} <= $tcp_hit_top_count[$dhitlimit]);
        for($pos = $dhitlimit; $pos > 0; $pos--)
        {
                last if($tcp_hit{$key} <= $tcp_hit_top_count[$pos]);
                $tcp_hit_top_count[$pos + 1] = $tcp_hit_top_count[$pos];
                $tcp_hit_top_doc[$pos + 1] = $tcp_hit_top_doc[$pos];
                $tcp_hit_top_count[$pos] = $tcp_hit{$key};
                $tcp_hit_top_doc[$pos] = $key;
        }
}
for($pos = 1; $pos <= $dhitlimit; $pos++)
{
        printf "TH %7d hits for %s\n", $tcp_hit_top_count[$pos],
                $tcp_hit_top_doc[$pos];
}

printf "UL top $dhitlimit UDP_HIT documents:\n";
@udp_hit_top_count=();
@udp_hit_top_doc=();
foreach $key (keys %udp_hit)
{
        next if($udp_hit{$key} <= $udp_hit_top_count[$dhitlimit]);
        for($pos = $dhitlimit; $pos > 0; $pos--)
        {
                last if($udp_hit{$key} <= $udp_hit_top_count[$pos]);
                $udp_hit_top_count[$pos + 1] = $udp_hit_top_count[$pos];
                $udp_hit_top_doc[$pos + 1] = $udp_hit_top_doc[$pos];
                $udp_hit_top_count[$pos] = $udp_hit{$key};
                $udp_hit_top_doc[$pos] = $key;
        }
}
for($pos = 1; $pos <= $dhitlimit; $pos++)
{
        printf "UH %7d hits for %s\n", $udp_hit_top_count[$pos],
                $udp_hit_top_doc[$pos];
}

printf "RS top $Nlimit remote sites:\n";
@rsite_top_count=();
@rsite_top_site=();
foreach $key (keys %rsite)
{
        next if($rsite{$key} <= $rsite_top_count[$Nlimit]);
        for($pos = $Nlimit; $pos > 0; $pos--)
        {
                last if($rsite{$key} <= $rsite_top_count[$pos]);
                $rsite_top_count[$pos + 1] = $rsite_top_count[$pos];
                $rsite_top_site[$pos + 1] = $rsite_top_site[$pos];
                $rsite_top_count[$pos] = $rsite{$key};
                $rsite_top_site[$pos] = $key;
        }
}
for($pos = 1; $pos <= $Nlimit; $pos++)
{
        printf "SI %7d accesses for %s\n", $rsite_top_count[$pos],
                $rsite_top_site[$pos];
}

printf "UG top $Nlimit URL types:\n";
@type_top_count=();
@type_top_type=();
foreach $key (keys %type)
{
        next if($type{$key} <= $type_top_count[$Nlimit]);
        for($pos = $Nlimit; $pos > 0; $pos--)
        {
                last if($type{$key} <= $type_top_count[$pos]);
                $type_top_count[$pos + 1] = $type_top_count[$pos];
                $type_top_type[$pos + 1] = $type_top_type[$pos];
                $type_top_count[$pos] = $type{$key};
                $type_top_type[$pos] = $key;
        }
}
for($pos = 1; $pos <= $Nlimit; $pos++)
{
        printf "SI %7d accesses for %s\n", $type_top_count[$pos],
                $type_top_type[$pos];
}

foreach $key ( sort numerically ( keys %TOD)) {
 $tstamp=$key*300; # back to seconds
 @T = localtime($time);
 $min = $T[1];
 $hour = $T[2];

 printf"TD\t%d\t%d\tTCP Counts -- %d in/out bytes\n",
        $tstamp, $TOD{$key}, $TODbytes{$key};
}
foreach $key ( sort numerically ( keys %SHist)) {
 $tsize=$key*100; # back to seconds

  printf"ST\t%d-%d\tService Counts -- %d operations\n",
          $tsize,$tsize+100, $SHist{$key};
}

exit 0;

#========================================================================
sub flagsort { $flagcount{$b} <=> $flagcount{$a};}
sub usersort { $user{$b} <=> $user{$a} ;}
sub tcphitsort { $tcp_hit{$b} <=> $tcp_hit{$a} ;}
sub udphitsort { $udp_hit{$b} <=> $udp_hit{$a} ;}
sub rsitesort { $rsite{$b} <=> $rsite{$a} ;}
sub typesort { $type{$b} <=> $type{$a} ;}

sub numerically { $a <=> $b;}

sub not_cachable {
        $_ = shift;
        return 1 if (m'^http://.*cgi-bin');
        return 1 if (m'^http://.*\?');
        return 1 if (m'^gopher://.*\?');
        return 0;
}

--------------------------- cut here -----------------------------

-- 
Stewart Forster (Security and Web Proxy Cache Administrator)
connect.com.au pty ltd, Level 9, 114 Albert Rd, Sth Melbourne, VIC 3205, Aust.
Email: slf@connect.com.au   Phone: +61 3 9251-3684   Fax: +61 3 9251-3666
Received on Mon Oct 28 1996 - 14:25:26 MST

This archive was generated by hypermail pre-2.1.9 : Tue Dec 09 2003 - 16:33:23 MST