#!/usr/bin/perl
use strict;
use warnings;


my $size = 10;
my $output = "";
my $number = 10;
my %generated_tags = ();
my $p1 = 0;
my $p2 = 1;
my $d = 0;
my $rmin = 1;
my $rmax = 100;
my $n = 0;

my $version = 0.53;
my $vname = "tag_count_generator.pl";

if (!$ARGV[0] || $ARGV[0]eq "-h") {
	print   "\n\t-s <integer> \t\t:: define the length (pb) of the tags to be generated [default=$size]\n",
	"\t-n <integer> \t\t:: define the number of unique sequences to be generated [default=$number]\n",
	"\t-N <real> <real> \t:: generate tags with a normal distribution and pre-defined mean (real number) and variance (>0)\n",
	"\t-G <real> <real> \t:: generate tags with a gamma distribution and the pre-defined scale parameter (theta) and shape parameter (alpha)\n",
	"\t-P <real> \t\t:: generate tags with a Poisson distribution and a pre-defined mean (lambda)\n",
	"\t-E <real> \t\t:: generate tags with an exponential distribution and a pre-defined parameter (lambda)\n",
	"\t-rmin <integer>\t\t:: define the minimum number of occurrences of a tag [default=$rmin]\n",
	"\t-rmax <integer>\t\t:: define the maximum number of occurrences of a tag [default=$rmax]\n",
	"\t-o <FILE> \t\t:: output filename [default=STDOUT]\n",
	"\t-h      \t\t:: display help information\n",
	"\t-v      \t\t:: display current version\n",
	"\n\t$vname Version $version\n\n";
	exit;
}               

if ($ARGV[0] eq "-v") {
	print ("\n\t$vname Version $version\n\n");
	exit;        
}         




#
#first read command line to collect arguments
#
while (my $args = shift(@ARGV)) {
	#
	#generated sequence size
	#
	if ($args eq "-s") {
		if ($args = shift(@ARGV)) {
			$size = $args;
			if ($size <= 0) {
				print STDERR "ERROR: SIZE NOT VALID [0:oo[\n";
				exit;
			}
		}
		else {
			print STDERR "ERROR: SIZE NOT VALID [0:oo[\n";
			exit;
		}
	}
	#
	#number of sequences to be generated
	#
	if ($args eq "-n") {
		if ($args = shift(@ARGV)) {
			$number = $args;
			if ($number <= 0) {
				print STDERR "ERROR NUMBER OF TAGS NOT VALID [0:oo[\n";
				exit;
			}
		}
		else {
			print STDERR "ERROR NUMBER OF TAGS NOT VALID [0:oo[\n";
			exit;
		}
	}
	#
	#set the distribution of the counts to a Normal
	#
	if ($args eq "-N") {
		if ($args = shift(@ARGV)) {
			$p1 = $args;
			$d = 1;
			if ($args = shift(@ARGV)) {
				$p2 = $args;
				if ($p2 <= 0) {
					print STDERR "ERROR: NORMAL VARIANCE NOT VALID ]0:oo[\n";
					exit;
				}
			}
			else {
				print STDERR "ERROR: NORMAL VARIANCE NOT VALID ]0:oo[\n";
				exit;
			}
		}
		else {
			$p1 = 0;
		}
	}


	#
	#set the distribution of the counts to a Poisson
	#
	if ($args eq "-P") {
			if ($args = shift(@ARGV)) {
				$d = 3;
				$p1 = $args;
				if ($p1 <= 0) {
					print STDERR "ERROR: LAMBDA PARAMETER NOT VALID ]0:oo[\n";
					exit;
				}
			}
			else {
				print STDERR "ERROR: LAMBDA PARAMETER NOT VALID ]0:oo[\n";
				exit;
			}
	}
	#
	#set the distribution of the counts to a Exponential
	#
	if ($args eq "-E") {
			if ($args = shift(@ARGV)) {
				$d = 4;
				$p1 = $args;
				if ($p1 <= 0) {
					print STDERR "ERROR: LAMBDA PARAMETER NOT VALID ]0:oo[\n";
					exit;
				}
			}
			else {
				print STDERR "ERROR: LAMBDA PARAMETER NOT VALID ]0:oo[\n";
				exit;
			}
	}



	#
	#set the distribution of the counts to a Gamma
	#
	if ($args eq "-G") {
		if ($args = shift(@ARGV)) {
			$p1 = $args;
			$d = 2;
			if ($args = shift(@ARGV) && $args > 0) {
				$p2 = $args;
			}
			else {
				print STDERR "ERROR: INVALID GAMMA K PARAMETER\n";
				exit;
			}
		}
		else {
			print STDERR "ERROR: INVALID GAMMA THETA PARAMETER\n";
			exit;
		}
	}
	#
	#generated sequence size
	#
	if ($args eq "-rmin") {
		if ($args = shift(@ARGV)) {
			$rmin = $args;
		}
		else {
			$rmin = 1;
		}
	}
	#
	#generated sequence size
	#
	if ($args eq "-rmax") {
		if ($args = shift(@ARGV)) {
			$rmax = $args;
		}
		else {
			print STDERR "ERROR: MAX NUMBER OF REPEATS INVALID\n";
			exit;
		}
	}
	#
	#output file
	#
	elsif ($args eq "-o") {
		if ($args = shift(@ARGV)) {
			$output = $args;
		}
		else {
			print STDERR "OUTPUT NOT SPECIFIED!\n";
			exit;
		}
	}

}



if ($output) {
	open OUTPUT,">$output" or die "COULD NOT OPEN $output : $!";
}

#
#to generate unique tags, I need to collect them to verify if
#later tags are not the same
#
for (my $j = 0; $j < $number; $j++){
	my $good = 0;
	my $sequence = "";
	while (!($good)){
		$sequence = "";
		for (my $i = 0; $i < $size; $i++) {
			#
			#randomly choose a base
			#
			my $number  =  int(rand(4));
			my $base;
			if ($number == 0){
				$base = "a";
			}
			elsif ($number == 1){
				$base = "c";
			}
			elsif ($number == 2){
				$base = "g";
			}
			else{
				$base = "t";
			}
			#
			#increment the sequence
			#
			$sequence .= $base;

		}
		#
		#we cannot generate a tag twice or
		#generate a tag with a catg in it
		#
		if (($sequence =~ m/catg/i) || exists($generated_tags{$sequence})){
			print STDERR "trying another sequence\n";
			$good = 0;
			if ($n >= $rmax) {
				$good = 1;
			}
		}
		else {
			$n++;
			$good = 1;
			$generated_tags{$sequence} = &getCount ($d, $p1, $p2, $rmin, $rmax);

		}
	}
}

my $sum = 0;
foreach my $key (sort(keys(%generated_tags))){
	&printout ("$key=" . $generated_tags{$key} . "\n");
	$sum += $generated_tags{$key};
}

print STDERR "SUM OF TAG COUNTS=$sum\n";


sub getCount {
	
	my $d = shift;
	my $p1 = shift;
	my $p2 = shift;
	my $min = shift;
	my $max = shift;
	my $r = 1;
	
	if ($d == 1) {
		$r = &countNormal ($p1, $p2, $min, $max);
	}
	elsif ($d == 2) {
		$r = &countGamma ($p1, $p2, $min, $max);
	}
	elsif ($d == 3) {
		$r = &countPoisson ($p1, $min, $max);
	}
	elsif ($d == 4) {
		$r = &countExponential ($p1, $min, $max);
	}
	else {
		$r = int (rand ($max - $min + 1) + $min);
	}

	return $r;
}

sub countNormal {
	my $n;
	my $mi = shift;
	my $ro = shift;
	my $min = shift;
	my $max = shift;

	$n = 0;
	while ($n < $min || $n > $max || $n < 1) {
		$n = int (&normal * $ro + $mi);
	}

	return $n;
}

sub normal {
	my $v1;
	my $v2;
	my $r;

	$r = 1;
	while ($r >= 1) {
		$v1 = 2 * rand() - 1;
		$v2 = 2 * rand() - 1;
		$r = $v1**2 + $v2**2;
	}
	$r = sqrt (-2 * log($r) / $r);

	return $v2 * $r;
}

sub countGamma {
	my $k = shift;
	my $theta = shift;
	my $min = shift;
	my $max = shift;
	my $ret = 0;
	while ($ret < $min || $ret > $max || $ret == 0) {
		$ret = int (&gamma_rand ($k, $theta));
	}

	return $ret;
}


sub gamma_rand {
	my $k = shift;
	my $theta = shift;
	my $sum = 0;
	my $ret;
	my $intk = int ($k);
	my $E = 0;

	for (my $i = 0; $i < $k; $i++) {
		$sum += log (rand());
	}

	$E = &gamma_func ($k - $intk);
	$ret = $theta * ($E - $sum);

	return $ret;
}


sub gamma_func {
	my $m = 0;
	my @v;
	my @E;
	my @n;
	my $e = 2.71828183;
	my $delta = shift;
	if (!$delta) {
		return 0;
	}
	$v[0] = $e / ($e + $delta);
	do {
		$m++;
		$v[2 * $m - 1] = rand();
		$v[2 * $m] = rand();
		if ($v[2 * $m - 2] <= $v[0]) {
			$E[$m] = $v[2 * $m - 1]**(1 / $delta);
			$n[$m] = $v[2 * $m] * $E[$m]**($delta - 1);
		}
		else {
			$E[$m] = 1 - log($v[2 * $m - 1]);
			$n[$m] = $v[2 * $m] * $e**(-$E[$m]);
		}
	} while ($n[$m] > ($E[$m]**($delta - 1) * $e**(-$E[$m])));

	return $E[$m];

}

sub countPoisson {
	my $lambda = shift;
	my $min = shift;
	my $max = shift;
	my $n = 1;

	do {
		$n = &poisson ($lambda);
	}	while ($n < $min || $n > $max);
		
	return $n;
}


sub poisson {
	my $lambda = shift;
	my $k = 0; 
	my $p = 1; 
	my $e = 2.71828183;
	my $L = $e**(-$lambda);

	while ($p >= $L) {
		$k++; 
		$p = $p * rand();
	}   

	return $k - 1;
}

sub countExponential {
	my $lambda = shift;
	my $min = shift;
	my $max = shift;
	my $n = 1;

	do {
		$n = int( &exponential ($lambda));
	}	while ($n < $min || $n > $max);
		
	return $n;
}

sub exponential {
	my $lambda = shift;

	my $r = rand();

	my $e = -log($r) / $lambda;

	return $e
}

sub printout {
	if ($output) {
		print OUTPUT shift();
	}
	else {
		print shift();
	}
}

