#!/usr/bin/perl
use strict;
#
#IMPORTANT: mininum_size and maximum_size at the
#moment can differ by only 1
#
my $minimum_size = 17;
my $maximum_size = 18;
my $coesive = 2;
my %hash = ();
my %count = ();
my $input = "";
my $output = "";
my $routput = "";

my $version = 0.46;
my $vname = "extract_tags.pl";

if (!$ARGV[0] || $ARGV[0]eq "-h") {
	print "\n\t-min <integer> \t:: set minimum size of the tags [default:$minimum_size]\n",
	"\t-max <integer> \t:: set maximum size of the tags [default:$maximum_size]\n",
	"\t-c <integer> \t:: set # of overlapping bases between tags [default:$coesive]\n",
	"\t-i <FILE> \t:: input file name [default:STDIN]\n",
	"\t-o <FILE> \t:: output file name [default:STDOUT]\n",
	"\t-r <FILE> \t:: report file name [default:STDERR]\n",
	"\t-h      \t:: display help information\n",
	"\t-v      \t:: display current version\n",
	"\n\t$vname Version $version\n\n";
	exit;
}

if ($ARGV[0] eq "-v") {
	print ("\n\t$vname Version $version\n\n");
	exit;
}


#
##first read command line to collect arguments
#
while (my $args = shift(@ARGV)) {
	#
	#coesive
	#
	if ($args eq "-c") {
		if ($args = shift(@ARGV)) {
			$coesive = $args;
		}
		else {
			$coesive = 0;
		}
	}
	#
	#minimun size
	#
	elsif ($args eq "-min") {
		if ($args = shift(@ARGV)) {
			$minimum_size = $args;
		}
		else {
			print STDERR "MINIMUM SIZE NOT SPECIFIED!\n";
			exit;
		}
	}

	#	
	#maximun size
	#
	elsif ($args eq "-max") {
		if ($args = shift(@ARGV)) {
			$maximum_size = $args;
		}
		else {
			print STDERR "MAXIMUM SIZE NOT SPECIFIED!\n";
			exit;
		}
	}

	#
	#input file
	#
	elsif ($args eq "-i") {
		if ($args = shift(@ARGV)) {
			$input = $args;
		}
		else {
			print STDERR "INPUT NOT SPECIFIED!\n";
			exit;
		}
	}
	#
	#output file
	#
	elsif ($args eq "-o") {
		if ($args = shift(@ARGV)) {
			$output = $args;
		}
		else {
			print STDERR "OUTPUT NOT SPECIFIED!\n";
			exit;
		}
	}
	#
	#output file
	#
	elsif ($args eq "-r") {
		if ($args = shift(@ARGV)) {
			$routput = $args;
		}
		else {
			print STDERR "REPORT OUTPUT NOT SPECIFIED!\n";
			exit;
		}
	}

}

if ($input) {
	open INPUT, "<$input" or die "COULD NOT OPEN $input :: $!\n";
	$input = "<INPUT>";
}
else {
	$input = "<>";
}

if ($output) {
	open OUTPUT, ">$output" or die "COULD NOT OPEN $output :: $!\n";
}

if ($routput) {
	open ROUTPUT, ">$routput" or die "COULD NOT OPEN $routput :: $!\n";
}
#
#fist separate individual tags from ditags
#


while (my $ditag = eval ($input)){
	chomp ($ditag);
	&clean ($ditag);
	#
	#get the ditag size
	#
	my $dt_size = length ($ditag);
	my $tag1;
	my $tag2;

	my $cut_th = &max ($dt_size + $coesive - $maximum_size, $minimum_size);
	$tag1 = substr ($ditag, 0, $cut_th);
	$tag1 = &cleanX ($tag1);
	$tag2 = &revcomp (substr ($ditag, -$cut_th));
	$tag2 = &cleanX ($tag2);

	my $size1 = length ($tag1);
	my $size2 = length ($tag2);
	
	if ($size1 <= $maximum_size && $size1 >= $minimum_size) {
		$count{$tag1}++;

		my $key = substr ($tag1, 0, $minimum_size);

		if (exists ($hash{$key})){
			if ($hash{$key} !~ m/\#$tag1\#/){
				$hash{$key} .= "$tag1#";
			}
		}
		else {
			$hash{$key} = "#$tag1#";
		} 
	}

	if ($size2 <= $maximum_size && $size2 >= $minimum_size) {
		$count{$tag2}++;

		my $key = substr ($tag2, 0, $minimum_size);

		if (exists ($hash{$key})){
			if ($hash{$key} !~ m/\#$tag2\#/) {
				$hash{$key} .= "$tag2#";
			}
		}
		else {
			$hash{$key} = "#$tag2#";
		} 
	}
}
my $total = 0;
#
#first print individual tag count
#
#print "INDIVIDUAL TAG COUNTS:\n";
foreach my $key (keys (%count)){
	&printout ("$key=$count{$key}\n");
	$total += $count{$key};
}


#
#now we try to make sense of tags with different sizes
#as of now, when tags of different sizes are compatible, we
#compute all counts to the tag with minimum size
#
&printerr ("PROCESSED TAG COUNTS:\n");
my $unique = (keys (%hash));
&printerr ("total different tags(may be redundant)=$unique\nConflicts:\n");
my $numberOfTagsInConflict = 0;
my $numberOfKeysInConflict = 0;
my $totalUniqueTags = 0;
foreach my $key (keys (%hash)){
	my $string = $hash{$key};
	#
	#need to remove first and last "#"
	#
	$string =~ s/^\#//;
	$string =~ s/\#$//;
	my @tags = split ("#",$string);
	&printerr ("DEBUG:$key=>$string<=\n");
	my $num_tags = @tags;
	@tags = sort {$b cmp $a} (@tags);
	my $partial_count = 0;
	my $non_redundant_tags = shift (@tags);
	$partial_count = $count{$non_redundant_tags};
	my $has_subtags;
	foreach my $tag (@tags) {
		#$totalGoodTags += $count{$tag};
		if ($non_redundant_tags !~ m/$tag/){
			$non_redundant_tags .= " $tag";
		}
		else {
			$has_subtags = 1;
		}
		$partial_count += $count{$tag};
	}
	my @final_tags = split (" ",$non_redundant_tags);
	my $num_tags = @final_tags;
	if ($num_tags > 1) {
		if ($has_subtags) {
			$numberOfKeysInConflict += 1;
			&printerr ("\t BAD, conflicts:$key", $count{$key}, ")==>");
			foreach my $tag (@final_tags){
				&printerr ("$tag($count{$tag}) ");
			}
			&printerr ("\n");
			$numberOfTagsInConflict += $partial_count;
		}
		else {
			&printerr ("=>OK, no conflict\n");
			foreach my $tag (@final_tags){
				my $count = $count{$tag};
				&printerr ("$tag=$count\n");
				$totalUniqueTags +=$count;
			}
		}


	}
	else {
		$totalUniqueTags += $partial_count;
		my $the_tag = shift(@final_tags);
		&printerr ("$the_tag=$partial_count \n");
	}
}
&printerr ("(REDUNDANT KEYS, TAGS IN CONFLICT, TOTAL TAGS, TOTAL GOOD TAGS)= ",
	"$numberOfKeysInConflict, $numberOfTagsInConflict, $total, $totalUniqueTags\n");
&printerr ("====\n");


sub revcomp {
	my $original = shift;
	$original =~ tr/acgtACGT/tgcaTGCA/;
	my @chars = split (//,$original);
	my @reversed;
	while (my $char = shift (@chars)){
		unshift (@reversed, $char);
	}
	my $result = join ("",@reversed);
	return $result;
}

sub printout {
	if ($output) {
		while (my $p = shift()) {
			print OUTPUT $p;
		}
	}
	else {
		while (my $p = shift()) {
			print $p;
		}
	}
}


sub printerr {
	if ($routput) {
		while (my $p = shift()) {
			print ROUTPUT $p;
		}
	}
	else {
		while (my $p = shift()) {
			print STDERR $p;
		}
	}
}


sub max {
	my $a = shift;
	my $b = shift;
	if ($a > $b) {
		return $a;
	}
	else {
		return $b;
	}
}

sub clean {
	my $string = shift;

	$string =~ s/^\s+//;
	$string =~ s/\s+$//;

	return $string;
}

sub cleanX {
	my $string = shift;

	$string =~ s/^X+//;
	$string =~ s/X+$//;

	return $string;
}

