#!/usr/bin/perl
use Getopt::Long;
use strict;
##############################################
# SDPblocks.pl
#
# (C) Richard Mott & Oxford University , 2004, 2005
#
#
##############################################
#
#
# usage sdpblocks -variants Final.Variants.data.140604.txt
#
##############################################

my @previous;
my $start = 0;
my $rare = 10;
my $ignore = 1;
my $filename = "";
my $omit = 0;
my $C = 10;

GetOptions( "variants=s"=>\$filename,   "rare=s"=>\$rare, ignore=>\$ignore,  "omit!"=>\$omit, "C=s"=>\$C  );

my ($data, $strains, $variant, $nvariant ) = ReadVariants( $filename, $rare*$omit );

my @blocks;
$blocks[0] = BestSDPBlocks( $data, 0, $strains, 0 );
for( my $k=1;$k<=10;$k++) {
  $blocks[$k] = BestSDPBlocks( $data, $k, $strains, 0 );
  my @bl = @{$blocks[$k]};
}


sub ReadVariants {
  my ( $filename, $rare) = @_;
  my $data = [];

  open(FILE, $filename ) || die "Could not open $filename\n";

  my $chr = 1;
  my %variant;
  my %nvariant;
  my $previous;
  my $nblocks = 0;
  my $omitted = 0;
  my $m = 0;
  $_= <FILE>;
  my ( $variant, $pos, @strains ) = split;
  splice ( @strains, 8 );
#  print "@strains\n";
  while(<FILE>) {
    #    print;
    chomp;
    my ( $variant, $pos, @alleles ) = split;
    splice ( @alleles, 8 );
    my %al;
    my $n = 0;
    my @a = ();
    for ( my $k=0;$k<=$#alleles;$k++) {
      $al{$alleles[$k]} = $n++ if ( ! exists($al{$alleles[$k]}));
    }
    my @na = ( keys %al );
    my $diallelic = ( $#na == 1 ); # two alleles 
    #    print "$variant $diallelic @na\n";
    my $numeric = 0;
    foreach my $all ( @na ) {
      $numeric ++ if ( $all =~ /^\d+$/);
    }
    if ( $diallelic && ! $numeric ) {
      
      for( my $k=0;$k<=$#alleles;$k++) {
	$a[$k] = $al{$alleles[$k]};
      }
      my $v = join( "", @a );
      $data->[$m] = { name=>$variant, chr=>$chr, pos=>$pos, alleles=>\@alleles, numeric=>\@a,  variant=>$v };
      $variant{$v} ++;
      print join( "\t", $m+1, $data->[$m]{name}, $data->[$m]{chr}, $data->[$m]{pos}, join(",", @{$data->[$m]{alleles}}), $data->[$m]{variant} ) . "\n";
      $m++;
      
      @previous = @a;
      $nvariant{$v} = \@a;
#      printf "%15s %2s %15d %5d ", $variant, $chr, $pos, $nblocks;
#      print join( " ", @a ) . " " . join( " ", @alleles ) . "\n";
    }
    else {
      $omitted++;
    }
  }
  if ( $rare ) {# remove rare variants
    print "removing rare variants\n";
    my $n=0;
    my $ndata = [];
    for( my $k=0;$k<=$#$data;$k++) {
      if ( $variant{$data->[$k]{variant}} >= $rare ) {
	$ndata->[$n] = $data->[$k];
	$n++;
	print join( "\t", @{$data->[$k]}) . "\n";
      }
      else {
	$omitted++;
	printf "omitting $k $data->[$k]{variant}\n";
      }
    }

    $data = undef;
    $data = $ndata;

  }

  print "Read from $filename: $#$data diallelic markers, $omitted omitted markers data $data\n";
  return ($data, \@strains, \%variant, \%nvariant );
}



sub BestSDPBlocks {
  my ( $data, $C, $strains, $offset ) = @_;
  my %sdp;
  my $X = [];
  for( my $i=0;$i<=$#$data;$i++ ) {
    my $s = $data->[$i]{variant};
    $sdp{$s}++;
    $X->[$i]{$s} = 1; 
  }

  my $Y = [];
  my @s = keys %sdp;
    

  my $len = $#$data;
  foreach my $s ( @s ) { 
    if ( $X->[0]{$s} ) {
      $Y->[0]{$s}{Y} = 0;
      $Y->[0]{$s}{T} = $s;
    }
    else {
      $Y->[0]{$s}{Y} = 1;
      $Y->[0]{$s}{T} = $s;
    }
  }
  for( my $i=1;$i<=$len;$i++ ) {
      foreach my $s ( @s ) { 
	my $d = 1-$X->[$i]{$s};
	  my $y=1.0e10;
	  my $T;
	  foreach my $t ( @s ) {
	      my $u = $Y->[$i-1]{$t}{Y} + $C*($s ne $t) + $d;
	      if ( $u < $y ) {
		  $y = $u;
		  $T = $t;
	      }
	  }
	  $Y->[$i]{$s} = { Y=>$y, T=>$T };
      }
  }

  my $y=1.0e10;
  my $T;
  foreach my $s ( @s ) {
    if ( $Y->[$len]{$s}{Y} < $y ) {
      $y = $Y->[$len]{$s}{Y};
      $T = $s;
    }
  }


  my @tlabel;
  my $J = $len;
  while ( $J > 0 ) {
    my $end=$J;
    my $score = 1;
    while( $J>0 && $T eq $Y->[$J]{$T}{T} ) {
#      my $x = join( " ", keys %{$Y->[$J]{$T}} );
#      print "$J | $T|  $Y->[$J]{$T} | $Y->[$J]{$T}{T}|$Y->[$J]{$T}{Y}\n";
      $score += $X->[$J]{$T};
      $J--;
    }
    my $start = $J+1;
    my @x = split(//,$T);
    my $k=0;
    my $pattern = {};
    foreach my $st ( @$strains ) {
      $pattern->{$st} = $x[$k++];
    }
#    unshift @tlabel , [ $start, $end, $pattern ];
    unshift @tlabel , [ $start+$offset, $end, $pattern ];
    my $k = $end-$start+1;
    my $percent = $k >0 ? 100*$score/$k : 0;
    print "C = $C SDP block $start $end $T score $score len $k percent $percent\n";
    $T = $Y->[$J]{$T}{T};
  }

  return \@tlabel;
}


