#!/usr/bin/perl -w
## Name:	mini_weblog_crawler.pl
## Purpose:	Search a web server access log (param 1) for response codes of 400 and above and count, per
##			unique IP address, the requests that caused them. If the number of trials reaches $threshold
##			(param 2), check whois for the address if scope (param 4) is set to 'n'; with scope 'i' the
##			single address is used. Earliest (param 3) specifies how far to look back in the server log.
##			Block duration (param 5) specifies in days, hours or minutes how long an address will be blocked.
##			Search whois output and convert network address to CIDR notation, if required.
##			Print output with count of trials, ip address, network and last-seen date in case feedback
##			(param 6) is set to 'v' (verbose). If set to 's' (silent) only one line per ip address
##			or network address with its number of trials is printed. This output could be used by a
##			wrapping script to extract the network addresses and add them to existing firewall rules to block.

## Requires:
##	Module Net::CIDR - needs to be installed via CPAN for example on Mac Systems

## Change History
## Version:	1.0		2020-04-15	Initial version tested

use Net::CIDR;

# Positional command line parameters:
#   1 logfile, 2 threshold, 3 earliest, 4 scope, 5 block duration, 6 feedback.
# Parameters that are not supplied stay undefined; they are validated in _inits.
my $num_args = scalar(@ARGV);
my ($logfile, $threshold, $earliest, $scope, $blk_duration, $feedback) = @ARGV;

# feedback is the only parameter with a default (verbose)
$feedback = 'v' unless ( defined $feedback );

# Message describing why one of the file handling subs failed
my $err_msg;

# Addresses found in the web log and their per-address data (counter, tstamp)
my (@log_ips, %ip_addrs);

# Addresses held in the register file and their per-address data
my (@reg_ips, %ip_register);
my $ip_reg_file = "/etc/pf.ip_register";

# Amount and unit (d, h or m) taken from the block duration parameter
my ($dur_amt, $dur_unit);

# Number of logged addresses at/above and below the threshold
my $num_log_ips_above = 0;
my $num_log_ips_below = 0;

# Month abbreviation as used in the web log => two-digit month number
my %months = (
	Jan => "01", Feb => "02", Mar => "03", Apr => "04",
	May => "05", Jun => "06", Jul => "07", Aug => "08",
	Sep => "09", Oct => "10", Nov => "11", Dec => "12",
);

# Scratch variables shared by the log parsing code
my $row;
my ($ip, $tstamp, $html_code);
my ($year, $month, $day, $hour, $minute, $second);

#-----------------------------------------------------------------------------
# Main
#-----------------------------------------------------------------------------
print STDERR "mini_weblog_crawler starting.\n" if ( $feedback eq 'v' );

# Exit status of the script: 0 on success, 1 if a parameter check or a file operation failed
my $exit_code = 0;

if ( _inits() == 0 and _read_ip_reg() == 0 and _read_weblog() == 0 ) {
	# _inits has normalised $feedback to 'v', 's' or 't' at this point
	my $chatty = ( $feedback eq 'v' or $feedback eq 't' );
	my $num_log_ips = $#log_ips + 1;

	if ( $chatty ) {
		print STDERR "$num_log_ips unique ip addresses detected in $logfile causing HTML response codes 400 and higher.\n";
		print STDERR "$num_log_ips_below unique ip addresses detected with number of trials below given threshold $threshold.\n";
		print STDERR "$num_log_ips_above unique ip addresses detected with number of trials above or equal given threshold $threshold.\n";
	}

	if ( $chatty and $num_log_ips_above > 0 ) {
		print STDERR "Determining network addresses for remaining ip addresses from whois...\n" if ( $scope eq "n" );
		print STDERR "trials\tfrom IP\t\tlast seen\n" if ( $scope eq "i" );
		print STDERR "trials\tfrom IP\t\tnetwork\t\tlast seen\n" if ( $scope eq "n" );
	}

	# Register every logged address that reached the threshold and is not yet known
	foreach my $ip ( @log_ips ) {
		print STDERR "L1: ip=$ip\n" if ( $feedback eq 't');
		next if ( defined $ip_register{$ip} );
		next if ( $ip_addrs{$ip}{counter} < $threshold );

		if ( $scope eq 'n' ) {
			# translate address from single ip to network address
			$ip_register{$ip}{range} = _call_WHOIS($ip);
		} else {
			$ip_register{$ip}{range} = "$ip/32";
		}
		# getTimeStamp returns "<epoch>,<human readable>" of the moment the block ends
		my ($t_epoch, $t_human) = split(/,/, getTimeStamp());
		$ip_register{$ip}{blocked}      = "blocked new";
		$ip_register{$ip}{until_epoch}  = $t_epoch;
		$ip_register{$ip}{until_human}  = $t_human;
		$ip_register{$ip}{trials}       = $ip_addrs{$ip}{counter};
		$ip_register{$ip}{tstamp}       = $ip_addrs{$ip}{tstamp};
	}

	# Print all registered addresses that have to be blocked (again or new)
	@reg_ips = sort(keys %ip_register);
	foreach my $ip ( @reg_ips ) {
		print STDERR "L2: ip=$ip\t$ip_register{$ip}{blocked}\tIP address:\t$ip\tnumber of trials:\t$ip_register{$ip}{trials}\n" if ( $feedback eq 't');
		# entries marked "unblocked" have expired and are not reported
		next unless ( $ip_register{$ip}{blocked} eq "blocked new" or $ip_register{$ip}{blocked} eq "block remains" );

		if ( $scope eq 'n' ) {
			if ( $feedback eq 'v' ) {
				print "$ip_register{$ip}{trials}\t$ip\t$ip_register{$ip}{range}\t$ip_register{$ip}{tstamp}\n";
			} else {
				print "IP address:\t$ip_register{$ip}{range}\tnumber of trials:\t$ip_register{$ip}{trials}\n";
			}
		} else {
			if ( $feedback eq 'v' ) {
				print "$ip_register{$ip}{trials}\t$ip\t$ip_register{$ip}{tstamp}\n";
			} else {
				print "IP address:\t$ip\tnumber of trials:\t$ip_register{$ip}{trials}\n";
			}
		}
	}

	# Finally dump the ip_register. A failed write must not go unnoticed: the
	# next run would otherwise start from a stale or truncated register.
	if ( _write_ip_reg() != 0 ) {
		print STDERR "$err_msg\n" if ( $err_msg );
		$exit_code = 1;
	}
} else {
	print STDERR "$err_msg\n" if ( $err_msg );
	$exit_code = 1;
}

print STDERR "mini_weblog_crawler done.\n" if ( $feedback eq 'v' or $feedback eq 't' );
exit $exit_code;

# -----------------------------------------------------------------------------
# Create the time stamp at which a new block ends (current time plus block duration)
# -----------------------------------------------------------------------------
sub getTimeStamp {
	# Returns "<epoch>,<yyyy-mm-dd hh:mm:ss>" for the current time plus the
	# block duration given by $dur_amt and $dur_unit (d, h or m). The second
	# part is the local-time rendering of the epoch value. Any other unit
	# adds nothing.
	print STDERR "s: getTimeStamp\n" if ( $feedback eq 't' );

	# seconds covered by one unit of each supported duration suffix
	my %secs_per_unit = ( d => 24 * 3600, h => 3600, m => 60 );

	my $until = time();
	$until += $dur_amt * $secs_per_unit{$dur_unit} if ( exists $secs_per_unit{$dur_unit} );

	# localtime fields: 0=sec 1=min 2=hour 3=day of month 4=month (0-based) 5=year-1900
	my @lt = localtime($until);
	my $human = sprintf("%04d-%02d-%02d %02d:%02d:%02d", $lt[5] + 1900, $lt[4] + 1, $lt[3], $lt[2], $lt[1], $lt[0]);

	print STDERR "e: getTimeStamp\n" if ( $feedback eq 't' );
	return "$until,$human";
}

#-----------------------------------------------------------------------------
# Parameter checks
#-----------------------------------------------------------------------------
sub _inits {
	# Validates the command line parameters held in the file-level variables
	# and normalises them where needed:
	#   $feedback     anything other than 's' or 't' becomes 'v'
	#   $earliest     a date without time gets " 00:00:00" appended
	#   $blk_duration is split into $dur_amt (1..99) and $dur_unit (d, h or m)
	# Returns 0 if all parameters are usable, otherwise the code of the last
	# failed check (4, 8 or 12). Every failed check is reported on STDERR.
	my $func_rc = 0;
	print STDERR "s: _inits\n" if ( $feedback eq 't' );
	$feedback = 'v' if ( $feedback ne 's' and $feedback ne 't' );

	# Current local time in the same layout as $earliest, so that both can be
	# compared as plain strings. Built from localtime instead of an external
	# `date` call.
	my @now = localtime();
	my $actual = sprintf("%04d-%02d-%02d %02d:%02d:%02d", $now[5] + 1900, $now[4] + 1, $now[3], $now[2], $now[1], $now[0]);

	if ( ! defined $logfile or $logfile eq "" ) {
		print STDERR "Name and path to logfile not given. Terminating.\n";
		$func_rc = 8;
	}

	# pattern check first so that a non-numeric value is never compared numerically
	if ( ! defined $threshold or $threshold !~ m/\A\d{1,5}\z/ or $threshold == 0 ) {
		print STDERR "Threshold not given, not numeric or not in range 1..99999. Terminating.\n";
		$func_rc = 12;
	}

	# a plain date is completed first, so the checks below apply to it as well
	if ( defined $earliest and $earliest =~ m/\A(\d{4})-(\d{2})-(\d{2})\z/ ) {
		$earliest = "$1-$2-$3 00:00:00";
		print STDERR "Earliest time omitted (only date specified). Time set to 00:00:00. New earliest now: $earliest\n";
	}
	if ( ! defined $earliest or $earliest !~ m/\A\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\z/ ) {
		print STDERR "Earliest date time to search for not given or not in format yyyy-mm-dd hh:mm:ss. Terminating.\n";
		$func_rc = 12;
	} elsif ( $earliest gt $actual ) {
		print STDERR "Earliest set to future date_time. Terminating.\n";
		$func_rc = 4;
	}

	if ( ! defined $scope or $scope !~ m/\A[in]\z/ ) {
		print STDERR "Scope not defined or not 'i' for IP addresses or 'n' for complete networks. Terminating.\n";
		$func_rc = 12;
	}

	# [dhm] and not [d|h|m]: inside a character class '|' is a literal, which
	# would let "5|" pass as a duration that getTimeStamp cannot add.
	my $blk_shown = defined $blk_duration ? $blk_duration : "";
	if ( $blk_shown =~ m/\A(\d{1,2})([dhm])\z/ ) {
		$dur_amt = $1; $dur_unit = $2;
		if ( $dur_amt == 0 ) {
			print STDERR "Block duration $blk_shown not in range 1..99 d|h|m (days|hours|minutes). Terminating.\n";
			$func_rc = 12;
		}
	} else {
		print STDERR "Block duration $blk_shown not defined or not in range 1..99 d|h|m (days|hours|minutes). Terminating.\n";
		$func_rc = 12;
	}

	print STDERR "e: _inits\n" if ( $feedback eq 't' );
	return $func_rc;
}

#-----------------------------------------------------------------------------
# Read ip register file into hash %ip_register
# Then loop through list and clear those entries that have reached their due time
#-----------------------------------------------------------------------------
sub _read_ip_reg {
	# Loads the register file $ip_reg_file into %ip_register. Each row is
	#   ip,range,blocked,until_epoch,until_human,trials,tstamp
	# Rows marked "unblocked" are dropped, as are blank or truncated rows.
	# Afterwards every loaded entry whose until_epoch has passed (or is not
	# a number) is marked "unblocked", and a still valid "blocked new" entry
	# becomes "block remains".
	# Returns 0 on success, or 12 with $err_msg set if the file cannot be opened.
	my $func_rc = 0;
	print STDERR "s: _read_ip_reg\n" if ( $feedback eq 't' );
	if ( open (my $reg_fh, '<', $ip_reg_file) ) {
		while ( my $line = <$reg_fh> ) {
			chomp $line;
			my ($r_ip, $r_rng, $r_blk, $r_epo, $r_hum, $r_trials, $r_occt) = split(/,/, $line);
			# A row without address or state (e.g. an empty line) would otherwise
			# create a register entry under the empty key.
			next if ( ! defined $r_ip or $r_ip eq "" or ! defined $r_blk );
			# skip "unblocked" entries; they've been recorded to log the unblock event
			next if ( $r_blk eq "unblocked" );

			$ip_register{$r_ip}{range}       = defined $r_rng    ? $r_rng    : "";
			$ip_register{$r_ip}{blocked}     = $r_blk;
			$ip_register{$r_ip}{until_epoch} = defined $r_epo    ? $r_epo    : "";
			$ip_register{$r_ip}{until_human} = defined $r_hum    ? $r_hum    : "";
			$ip_register{$r_ip}{trials}      = defined $r_trials ? $r_trials : "";
			$ip_register{$r_ip}{tstamp}      = defined $r_occt   ? $r_occt   : "";
		}
		close $reg_fh;
		@reg_ips = sort(keys %ip_register);
		print STDERR "num read reg_ips=$#reg_ips\n" if ( $feedback eq 't' );
		my $epoch_now = time();
		# check for outdated registered ip addresses and mark them as unblocked
		foreach my $ip ( @reg_ips ) {
			my $until = $ip_register{$ip}{until_epoch};
			if ( $until !~ m/\A\d+\z/ or $epoch_now >= $until ) {
				# an unreadable end time counts as expired
				$ip_register{$ip}{blocked} = "unblocked";
			} elsif ( $ip_register{$ip}{blocked} eq "blocked new" ) {
				$ip_register{$ip}{blocked} = "block remains";
			}
		}
	} else {
		$func_rc = 12;
		$err_msg = "File $ip_reg_file does not exist or is not accessible. Terminating.";
	}
	print STDERR "e: _read_ip_reg\n" if ( $feedback eq 't' );
	return $func_rc;
}

#-----------------------------------------------------------------------------
# Write ip register from hash %ip_register to file,
# one comma separated row per registered ip address
#-----------------------------------------------------------------------------
sub _write_ip_reg {
	# Replaces the content of $ip_reg_file with one row per entry of
	# %ip_register, sorted by ip address:
	#   ip,range,blocked,until_epoch,until_human,trials,tstamp
	# Returns 0 on success, or 12 with $err_msg set if the file cannot be
	# opened or not all rows could be written.
	my $func_rc = 0;
	print STDERR "s: _write_ip_reg\n" if ( $feedback eq 't' );
	my @columns = qw(range blocked until_epoch until_human trials tstamp);
	@reg_ips = sort(keys %ip_register);
	if ( open (my $reg_fh, '>', $ip_reg_file) ) {
		foreach my $ip ( @reg_ips ) {
			# a missing field is written as an empty column
			my @fields = map { defined $ip_register{$ip}{$_} ? $ip_register{$ip}{$_} : "" } @columns;
			my $prtrow = join(",", $ip, @fields) . "\n";
			$func_rc = 12 unless ( print {$reg_fh} $prtrow );
			print STDERR "ip_reg_file: $prtrow" if ( $feedback eq 't' );
		}
		# buffered write errors (e.g. a full disk) only show up when closing
		$func_rc = 12 unless ( close $reg_fh );
		$err_msg = "File $ip_reg_file could not be written. Terminating." if ( $func_rc != 0 );
	} else {
		$func_rc = 12;
		$err_msg = "File $ip_reg_file could not be written. Terminating.";
	}
	print STDERR "e: _write_ip_reg\n" if ( $feedback eq 't' );
	return $func_rc;
}

#-----------------------------------------------------------------------------
# Read weblog into hash %ip_addrs
#-----------------------------------------------------------------------------
sub _read_weblog {
	# Reads the access log $logfile and collects in %ip_addrs, per client
	# address, the number of requests answered with a status of 400 or higher
	# (counter) and the time of the last such request (tstamp,
	# "yyyy-mm-dd hh:mm:ss"). Requests older than $earliest are ignored.
	# Lines are expected to start like
	#   host ident user [dd/Mon/yyyy:hh:mm:ss zone] "request" status ...
	# Lines that do not have this layout are skipped.
	# Also fills @log_ips and the counters $num_log_ips_above and
	# $num_log_ips_below relative to $threshold.
	# Returns 0 on success, or 12 with $err_msg set if the log cannot be opened.
	my $func_rc = 0;
	print STDERR "s: _read_weblog\n" if ( $feedback eq 't' );
	if ( open (my $log_fh, '<', $logfile) ) {
		while ( my $line = <$log_fh> ) {
			chomp $line;

			# The host is a single token and the request is matched as one quoted
			# string with backslash escapes, so that text inside the request or
			# in later fields cannot shift the captured address or status.
			# A failed match must skip the line: $1.. would still hold the
			# values of the previous successful match.
			next unless ( $line =~ m/^(\S+) \S+ \S+ \[([^\]]+)\] "(?:[^"\\]|\\.)*" (\d{3})(?:\s|$)/ );
			my ($host, $raw_stamp, $status) = ($1, $2, $3);

			next unless ( $raw_stamp =~ m/^(\d{1,2})\/(\w{3})\/(\d{4}):(\d{2}):(\d{2}):(\d{2})\b/ );
			next unless ( exists $months{$2} );
			# zero padded so that time stamps compare correctly as strings
			my $seen = sprintf("%04d-%02d-%02d %02d:%02d:%02d", $3, $months{$2}, $1, $4, $5, $6);

			next if ( $seen lt $earliest );
			next if ( $status < 400 );
			$ip_addrs{$host}{counter} += 1;
			$ip_addrs{$host}{tstamp}   = $seen;
		}
		close $log_fh;

		@log_ips = sort(keys %ip_addrs);
		foreach my $addr ( @log_ips ) {
			if ( $ip_addrs{$addr}{counter} >= $threshold ) {
				$num_log_ips_above += 1;
			} else {
				$num_log_ips_below += 1;
			}
		}
	} else {
		$func_rc = 12;
		$err_msg = "File $logfile does not exist or is not accessible. Terminating.";
	}
	print STDERR "e: _read_weblog\n" if ( $feedback eq 't' );
	return $func_rc;
}

#-----------------------------------------------------------------------------
# Determine the network range of an ip address from whois output
#-----------------------------------------------------------------------------
# Picks from a list of CIDR blocks the one that contains $ip. Falls back to
# the first block if none matches or the lookup cannot be performed.
sub _cidr_containing {
	my ($ip, @cidrs) = @_;
	foreach my $cidr ( @cidrs ) {
		# eval: guards against a lookup that dies on input it cannot parse
		my $hit = eval { Net::CIDR::cidrlookup($ip, $cidr) };
		return $cidr if ( $hit );
	}
	return $cidrs[0];
}

sub _call_WHOIS {
	# Runs whois for the given address and returns the network it belongs to,
	# taken from the first of these whois fields that is present:
	#   route:    value as given
	#   CIDR:     value as given; of a comma separated list one block is picked
	#   inetnum:  the last inetnum entry; a "from - to" range is converted to CIDR
	# Returns "n/a" if the address is empty or not a plausible address, if
	# whois cannot be run, or if none of the fields is found.
	my $ip = $_[0];
	print STDERR "s: _call_WHOIS\n" if ( $feedback eq 't' );
	my $ip_range = "n/a";
	my $WHOIS    = "";

	# The address comes from a log file, so it is treated as untrusted: only
	# address/host name characters are accepted (no leading '-', which whois
	# would read as an option) and whois is started without a shell.
	if ( $ip and $ip =~ m/\A[A-Za-z0-9:][A-Za-z0-9.:_-]*\z/ ) {
		if ( open (my $whois_fh, '-|', 'whois', $ip) ) {
			local $/;
			my $output = <$whois_fh>;
			$WHOIS = $output if ( defined $output );
			close $whois_fh;
		}
	}

	if ( $WHOIS =~ m/^.*route:[\s]+([0-9,\.\/ ]+).*$/m ) {
		$ip_range = $1;
	} elsif ( $WHOIS =~ m/^.*CIDR:[\s]+([0-9,\.\/ ]+).*$/m ) {
		$ip_range = $1;
		if ( $ip_range =~ m/,/ ) {
			my @range_list = sort grep { $_ ne "" } map { my $c = $_; $c =~ s/^\s+|\s+$//g; $c } split(/,/, $ip_range);
			$ip_range = _cidr_containing($ip, @range_list) if ( @range_list );
		}
	} elsif ( $WHOIS =~ m/^.*inetnum:[\s]+([0-9,\.\/\- ]+)(?!.*inetnum:[\s]+([0-9,\.\/\- ]+)).*$/s ) {
		$ip_range = $1;
		if ( $ip_range =~ m/(.+) - (.+)$/ ) {
			# range2cidr returns a list of blocks; assigning it to a scalar
			# directly would store the number of blocks, not a block
			my @cidrs = Net::CIDR::range2cidr("$1-$2");
			$ip_range = @cidrs ? _cidr_containing($ip, @cidrs) : "n/a";
		}
	}
	$ip_range =~ s/^\s+|\s+$//g;
	$ip_range = "n/a" if ( $ip_range eq "" );

	print STDERR "e: _call_WHOIS\n" if ( $feedback eq 't' );
	return $ip_range;
}

# NOTE(review): appears to be unreachable - the main section above the sub
# definitions already ends with an unconditional exit.
exit 1;