#!/usr/bin/perl -w
#
# Given a 'results' dir from a bayes-10pcv-driver run,
# graph a ROC curve of accuracy.
#
# usage: graph-accuracy-curve [--buckets=100] ...dir/results .../dir2/results ...
#
# <@LICENSE>
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to you under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at:
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# </@LICENSE>


use Getopt::Long;

use strict;
use warnings;

my $usage = "usage: graph-accuracy-curve [--buckets=100] "
          . "...dir/results .../dir2/results ...\n";

# GetOptions() is given no destination for "buckets", so Getopt::Long stores
# the value in the package variable $opt_buckets.
our $opt_buckets;
GetOptions("buckets=i") or die $usage;
@ARGV or die $usage;

# --buckets=0 (or no option at all) falls back to the default of 100.
my $buckets = $opt_buckets || 100;
# A negative count would make $step negative and the loop below endless.
$buckets > 0 or die "--buckets must be a positive integer\n";

my $range_lo = 0.0;
my $range_hi = 1.0;

# State shared with dofile() and results_for_cutoff():
#   @buckets - lower edge of every histogram bucket, ascending
#   %bux_sp  - per-bucket count of scores read from the spam log
#   %bux_ns  - per-bucket count of scores read from the nonspam log
our @buckets = ();
our %bux_sp = ();
our %bux_ns = ();

my $step = ($range_hi - $range_lo) / $buckets;

# Bucket edges are produced by repeated addition on purpose: the subs
# re-derive edges with the same "$i += $step" arithmetic, so the values (and
# the hash keys they stringify to) must come from the identical computation.
my $i;
for ($i = $range_lo; $i <= $range_hi; $i += $step) {
  push (@buckets, $i);
}

# One gnuplot data set per results directory is written to plot.data.
open(DATA, '>', 'plot.data') or die "cannot write plot.data: $!";
my @dirs = ();
foreach my $dir (@ARGV) {
  # dofile() accumulates into the histograms, so zero them for each directory.
  foreach my $bucket (@buckets) {
    $bux_ns{$bucket} = $bux_sp{$bucket} = 0;
  }

  dofile(scalar(@dirs), "$dir/spam_all.log", "$dir/nonspam_all.log");
  push (@dirs, $dir);
}
# Buffered write errors only surface when the handle is closed.
close(DATA) or die "error writing plot.data: $!";

open(OUT, '|-', 'gnuplot -') or die "cannot run gnuplot: $!";

# set xtics 0,0.1,0.99

print OUT "
set xlabel 'FPs'
set ylabel 'FNs'
set logscale xy 2
set xrange []
set yrange []
set terminal png size 1024,768 crop
set out 'graph.png'

plot ";

# One curve per directory; data set N in plot.data belongs to the Nth
# directory, and line/point types are numbered from 1.
my @text = ();
foreach my $s (0 .. $#dirs) {
  my $t = $s + 1;
  # A single quote inside a single-quoted gnuplot string is written doubled.
  (my $title = $dirs[$s]) =~ s/'/''/g;
  push (@text, "  'plot.data' using 1:2 index $s with linesp lt $t pt $t t 'ham, $title'");
}

print OUT join(", \\\n", @text);
print OUT "\n";

# close() on a pipe waits for the child; it fails if gnuplot exited non-zero.
close(OUT)
  or die($! ? "error closing pipe to gnuplot: $!\n"
            : "gnuplot exited with status " . ($? >> 8) . "\n");
exit;


# dofile($setcount, $spam, $nonspam)
#
# Read one pair of log files, add each line's bayes score to the global
# histograms (%bux_sp for the spam log, %bux_ns for the nonspam log), then
# append one data set to the already-open DATA filehandle: an "FP FN" line
# for every cutoff in @buckets, followed by two blank lines.
#
#   $setcount - index of this data set; accepted but not used here
#   $spam     - path of the spam log
#   $nonspam  - path of the nonspam log
#
# Only lines starting with "." or "Y" followed by whitespace and carrying a
# "bayes=<value>" field are counted.  A score that falls in no bucket is
# reported on STDERR and skipped.  Dies if a log file cannot be opened.
sub dofile {
  my ($setcount, $spam, $nonspam) = @_;

  # Pair each log with the histogram it feeds, rather than comparing paths.
  foreach my $source ([$spam, \%bux_sp], [$nonspam, \%bux_ns]) {
    my ($file, $counts) = @$source;

    open(my $in, '<', $file) or die "Could not open file '$file': $!";

    LINE: while (my $line = <$in>) {
      $line =~ /^[.Y]\s.+bayes=([^\s,]+)/ or next LINE;
      my $score = $1 + 0;

      # Find the bucket whose half-open range [edge, edge+step) holds the score.
      my $bucket_id;
      foreach my $bucket (@buckets) {
        if ($score >= $bucket && $score < $bucket + $step) {
          $bucket_id = $bucket;
          last;
        }
      }

      # Without this guard the count would land under an undefined hash key.
      unless (defined $bucket_id) {
        warn "bayes score $score is outside the bucket range, "
           . "ignored at $file line $.\n";
        next LINE;
      }

      $counts->{$bucket_id}++;
    }

    close($in);
  }

  foreach my $bucket (@buckets) {
    my ($fp, $fn) = results_for_cutoff($bucket);
    print DATA "$fp $fn\n";
  }
  # Two blank lines end the data set (gnuplot's separator for "index").
  print DATA "\n\n";
}

# results_for_cutoff($cutoff)
#
# Count the misclassifications that result from using $cutoff as the
# spam/nonspam threshold, based on the global histograms:
#
#   false negatives - %bux_sp counts in buckets whose lower edge is below
#                     $cutoff
#   false positives - %bux_ns counts in buckets whose lower edge is at or
#                     above $cutoff
#
# Returns the list ($fp, $fn).  Each entry of @buckets is visited exactly
# once, so a call is linear in the number of buckets.  A bucket with no
# recorded count contributes zero.
sub results_for_cutoff {
  my $cutoff = shift;
  my $fn = 0;
  my $fp = 0;

  foreach my $bucket (@buckets) {
    if ($bucket < $cutoff) {
      $fn += $bux_sp{$bucket} || 0;
    } else {
      $fp += $bux_ns{$bucket} || 0;
    }
  }

  return ($fp, $fn);
}

