/**
 * clasp.c
 * fast fragment chaining
 * using sop gap costs
 *
 * @author Christian Otto
 * @email christian@bioinf.uni-leipzig.de
 * @company Bioinformatics, University of Leipzig
 * @date Thu May 22 15:19:03 CEST 2008
 */

/*
 * SVN
 * Revision of last commit: $Rev: 116 $
 * Author: $Author: steve $
 * Date: $Date: 2010-06-30 13:51:27 +0200 (Wed, 30 Jun 2010) $
 * Id: $Id: clasp.c 116 2010-06-30 11:51:27Z steve $
 * Url: $URL: http://www.bioinf.uni-leipzig.de/svn/segemehl/segemehl/branches/esa/trunk/src/clasp.c $
 */

#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <sys/times.h>
#include "info.h"
#include "debug.h"
#include "container.h"
#include "manopt.h"
#include "fileio.h"
#include "sltypes.h"
#include "slchain.h"
#include "rangetree.h"
#include "clasp.h"

/*
 * mute: global flag bound to the -m/--mute command line option in main().
 * NOTE(review): presumably evaluated by the logging macros to suppress
 * messages -- the consumer is not visible in this file, confirm there.
 */
unsigned char mute = 0;
/*
 * maxmem: defined in another translation unit; main() only reads it to
 * report the peak virtual memory when compiled with MAXMEM.
 */
extern double maxmem;

/*----------------------------------- main -------------------------------------
 *    
 * @brief       command line entry point: parses the options, reads the
 *              fragment file, sorts the fragments, clusters and chains each
 *              run of fragments sharing the same subject (sum-of-pair gap
 *              costs by default, linear with -L) and writes matches, chains
 *              and chain fragments to the output device
 * @param argc  number of command line arguments
 * @param argv  command line arguments
 * @return      0 after a complete run; exit(-1) is called if the output
 *              file can not be opened
 *   
 */
int main (int argc, char** argv){
  claspinfo_t info;
  manopt_optionset optset;
  manopt_arg *unflagged;
  manopt_arg *list;
  Uint i, j, k, begin;
  int num;
  time_t start, end;
  double chtime, intime;
  bl_claspinfoInit(&info);

  /* register all command line options */
  manopt_initoptionset(&optset, argv[0], NULL,
                       " Fast fragment chaining using sum-of-pair gap costs\n",
                       " clasp is free software for non-commercial use\n\
 (C) 2010 Bioinformatik Leipzig\n", VERSION,
                       " Please report bugs to christian@bioinf.uni-leipzig.de");
  manopt_blockseparator(&optset, "INPUT/OUTPUT");
  manopt(&optset, REQSTRINGOPT, 1, 'i', "input",
         "path/filename of fragment file", "<file>",
         NULL, &info.infilename);
  manopt(&optset, REQSTRINGOPT, 0, 'o', "out",
         "path/filename of output file", "<file>",
         NULL, &info.outfilename);
  manopt(&optset, LISTOPT, 0, 'c', "cols",
         "select column numbers with position information/score",
         "<qry_st> <qry_end> <db_st> <db_end> <score>",
         NULL, NULL);
  manopt(&optset, LISTOPT, 0, 'C', "idcols",
         "select column number(s) with identifiers", "<n>...<n>",
         NULL, NULL);
  manopt(&optset, FLAG, 0, 'f', "fragment",
         "report fragments", NULL, NULL, &info.outputf);
  manopt(&optset, FLAG, 0, 'O', "orig",
         "original output of fragments", NULL,
         NULL, &info.outputorig);
  manopt(&optset, FLAG, 0, 'm', "mute",
         "shut up!", NULL, NULL, &mute);
  manopt_blockseparator(&optset, "GENERAL");
  manopt(&optset, FLAG, 0, 'L', "lin",
         "use linear gap costs", NULL, NULL, NULL);
  //manopt(&optset, FLAG, 0, 'P', "sop",
  //	 "use sum-of-pair gap costs (default)", NULL, NULL, NULL);  
  manopt(&optset, REQDBLOPT, 0, 'l', "lambda",
         "chaining parameter lambda", "<double>",
         NULL, &info.lambda);
  manopt(&optset, REQDBLOPT, 0, 'e', "epsilon",
         "chaining parameter epsilon", "<double>",
         NULL, &info.epsilon);
  manopt(&optset, REQINTOPT, 0, 'G', "maxgap",
         "maximal gap", "<n>",
         NULL, &info.maxgap);
  manopt(&optset, REQDBLOPT, 0, 'S', "minscore",
         "minimal score", "<double>",
         NULL, &info.minscore);
  manopt(&optset, REQUINTOPT, 0, 'F', "minfrag",
         "minimal number of fragments", "<n>",
         NULL, &info.minfrag);

  unflagged = manopt_getopts(&optset, argc, argv);
  if(unflagged->noofvalues > 1) {
    manopt_help(&optset, "unknown argument(s)\n");
  }

  /* by default: sum-of-pair gap costs */
  if(manopt_isset(&optset, 'L', "lin")){
    info.chainmode = LIN;
  }
  else {
    info.chainmode = SOP;
  }

  /* maxgap: only -1 or non-negative values pass this check */
  if (manopt_isset(&optset, 'G', "maxgap") &&
      info.maxgap < 0 && info.maxgap != -1){
    manopt_help(&optset, "int argument '%d' for \
option 'G' (maxgap) must be either -1 or positive\n",
                info.maxgap);
  }
  /* 
   * first value = col of qry begin, second value = col of qry end,
   * third value = col of seq begin, fourth value = col of seq end,
   * fifth value = col of score
   * (column numbers start with 1 on the command line and are stored
   *  zero-based in info.colorder)
   */
  list = manopt_getarg(&optset, 'c', "cols");
  info.colorder = (Uint *) malloc(sizeof(Uint) * 5);
  if (manopt_isset(&optset, 'c', "cols")){
    if (list->noofvalues != 5){
      manopt_help(&optset, "please give five column numbers \
using -c <qry_st> <qry_end> <db_st> <db_end> <score> option\n");
    }
    else {
      for (i = 0; i < list->noofvalues; i++){
        num = atoi(list->values[i]);
        if (num <= 0 ){
          manopt_help(&optset, "unsigned non-zero int argument '%s' for \
option 'c' (cols) out of range\n", list->values[i]);
        }
        info.colorder[i] = num - 1;
        /* check for pair-wise disjunction (warning only) */
        for (j = 0; j < i; j++){
          if (info.colorder[j] == info.colorder[i]){
            DBG("Warning: column number '%d' is assigned multiple times.\n", 
                info.colorder[i] + 1);
          }
        }
      }
    }
  }
  else {
    /* default: the first five columns in the order given above */
    for (i = 0; i < 5; i++){
      info.colorder[i] = i;
    }
  }

  /* optional identifier columns (stored zero-based in info.idcol) */
  list = manopt_getarg(&optset, 'C', "idcols");
  if (manopt_isset(&optset, 'C', "idcols")){
    if (list->noofvalues < 1){
      manopt_help(&optset, "please give at least one identifier column number \
using -C <n>...<n> option\n");
    }
    info.idcolnum = list->noofvalues;
    info.idcol = (Uint *) malloc(sizeof(Uint) * info.idcolnum);
    for (i = 0; i < info.idcolnum; i++){
      num = atoi(list->values[i]);
      if (num <= 0 ){
        manopt_help(&optset, "unsigned non-zero int argument '%s' for \
option 'C' (idcols) out of range\n", list->values[i]);
      }
      info.idcol[i] = num - 1;
      /* check for pair-wise disjunction (warning only) */
      for (j = 0; j < i; j++){
        if (info.idcol[j] == info.idcol[i]){
          DBG("Warning: identifier column number '%d' is assigned multiple times.\n", 
              info.idcol[i] + 1);
        }
      }
      /* warn if an id column is also a position or score column */
      for (j = 0; j < 5; j++){
        if (info.colorder[j] == info.idcol[i]){
          DBG("Warning: identifier column number '%d' is already defined as a \
position or score information column.\n", info.idcol[i] + 1);
        }
      }
    }
  }

  /*
   * info.dev is only (re)assigned if an output file was requested.
   * NOTE(review): otherwise the value set by bl_claspinfoInit is kept,
   * presumably stdout -- confirm in bl_claspinfoInit.
   */
  if(info.outfilename){
    info.dev = fopen(info.outfilename, "w");
  }

  if(info.dev == NULL){
    DBG("Couldn't open file '%s'. Exit forced.\n", info.outfilename);
    exit(-1);
  }
  /* initialization */
  info.fragments = (Container *) malloc(sizeof(Container));
  bl_containerInit(info.fragments, 1000, sizeof(slmatch_t));
  /* lines only if original output format is required */
  if (info.outputorig){
    info.lines = (Container *) malloc(sizeof(Container));
    bl_containerInit(info.lines, 1000, sizeof(char *));
  }
  /* subject only required with id column(s) */
  if (info.idcol != NULL){
    info.subject = (Container *) malloc(sizeof(Container));
    bl_containerInit(info.subject, 100, sizeof(char *));
  }
  /* read input file */
  time(&start);
  bl_slmatchInitFromFile(info.fragments, info.lines, info.subject, 
                         info.infilename, "\t", info.colorder,
                         info.idcol, info.idcolnum);
  time(&end);
  intime = difftime(end, start);
  NFO("reading input has taken %.2f seconds.\n", intime);

  /* clustering and chaining */
  time(&start);
  /* output header */
  bl_slWriteHeader(&info);

  /* sort fragments */
  qsort(info.fragments->contspace, bl_containerSize(info.fragments),
        sizeof(slmatch_t), cmp_slmatch_qsort);
  begin = 0;
  for (i = 1; i <= bl_containerSize(info.fragments); i++){
    /* 
     * end of fragments list or different database sequence 
     * --> process fragment[begin]...fragment[i-1], write output
     *     and free chains (less memory consumption with large input files)
     */
    if (i == bl_containerSize(info.fragments) ||
        ((slmatch_t *) bl_containerGet(info.fragments, begin))->subject !=
        ((slmatch_t *) bl_containerGet(info.fragments, i))->subject){
      //fprintf(info.dev, "%d\t%d\n", begin, i-begin);
      if (info.chainmode == SOP){
        /* only use chaining without clustering if no ids are specified */
        //bl_slChainSop((slmatch_t *) info.fragments->contspace + begin, i - begin,
        //	      info.epsilon, info.lambda);  
        bl_slClusterSop((slmatch_t *) info.fragments->contspace + begin, i - begin,
                        info.epsilon, info.lambda, info.maxgap);
      }
      else {
        //bl_slChainLin((slmatch_t *) info.fragments->contspace + begin, i - begin,
        //	      info.epsilon, info.lambda);
        bl_slClusterLin((slmatch_t *) info.fragments->contspace + begin, i - begin,
                        info.epsilon, info.lambda, info.maxgap);
      }

      for (j = begin; j < i; j++){
        slmatch_t *match = (slmatch_t *) bl_containerGet(info.fragments, j);

        /* output matches (if desired) */
        if (info.outputm){
          fprintf(info.dev, "M\t");
          if (!info.outputorig){
            if (info.idcol != NULL){
              fprintf(info.dev, "%s\t",
                      *(char **) bl_containerGet(info.subject, match->subject));
            }
            fprintf(info.dev, "%d\t%d\t%d\t%d\t%.3f\n", match->i,
                    match->i + match->j - 1, match->p,
                    match->p + match->q - 1, match->scr);
          }
          /* output in original format as input */
          else {
            /*
             * info.lines is kept in input order while the fragments have
             * been sorted above, so the position j does not identify the
             * input line any more; match->idx is the index stored when
             * the line was read (same lookup as for the F records below)
             */
            fprintf(info.dev, "%s\n",
                    *(char **) bl_containerGet(info.lines, match->idx));
          }
        }
        if (match->chain){
          slchain_t *chain = (slchain_t *) match->chain;
          /* output chain if it passes the score and size filters */
          if (info.outputc && chain->scr >= info.minscore &&
              bl_containerSize(chain->matches) >= info.minfrag){
            fprintf(info.dev, "C\t");
            if (info.idcol != NULL){
              fprintf(info.dev, "%s\t", *(char **) bl_containerGet(info.subject, chain->subject));
            }
            fprintf(info.dev, "%d\t%d\t%d\t%d\t%.3f\n", chain->i,
                    chain->i + chain->j - 1, chain->p,
                    chain->p + chain->q - 1, chain->scr);
          }
          /* output chains and fragments (if requested) */
          if (info.outputf && chain->scr >= info.minscore &&
              bl_containerSize(chain->matches) >= info.minfrag){
            for (k = 0; k < bl_containerSize(chain->matches); k++){
              slmatch_t *frag = *(slmatch_t **)
                bl_containerGet(chain->matches, k);
              fprintf(info.dev, "F\t");
              if (!info.outputorig){
                if (info.idcol != NULL){
                  fprintf(info.dev, "%s\t",
                          *(char **) bl_containerGet(info.subject, frag->subject));
                }
                fprintf(info.dev, "%d\t%d\t%d\t%d\t%.3f\n", frag->i,
                        frag->i + frag->j - 1, frag->p, frag->p + frag->q - 1,
                        frag->scr);
              }
              /* output in original format as input */
              else {
                fprintf(info.dev, "%s\n",
                        *(char **) bl_containerGet(info.lines, frag->idx));
              }
            }
          }
          /* chain is released right after its output */
          bl_slchainDestruct(chain);
          free(chain);
          match->chain = NULL;
        } /* END OF if (match->chain) */
      }  /* END OF for (j = begin; j < i; j++) */
      begin = i;
    } /* END OF  if (i == bl_containerSize(info.fragments) ||
        ((slmatch_t *) bl_containerGet(info.fragments, begin))->subject !=
        ((slmatch_t *) bl_containerGet(info.fragments, i))->subject) */
  } /* END OF for (i = 1; i <= bl_containerSize(info.fragments); i++) */
  time(&end);
  chtime = difftime(end, start);

  NFO("chaining has taken %.2f seconds.\n", chtime);
  #ifdef MAXMEM
  NFO("peak virtual memory was %.2f MB.\n", (maxmem * 4)/1024.0);
  #endif
  /* destruct everything */
  bl_claspinfoDestruct(&info);
  manopt_destructoptionset(&optset);
  manopt_destructarg(unflagged);
  free(unflagged);
  return 0;
}

/*----------------------------- bl_slWriteHeader -------------------------------
 *    
 * @brief       output of base header (including parameters) and
 *              the column descriptions
 * @author      Christian Otto
 *   
 */
void bl_slWriteHeader(void *data){  
  Uint i;
  time_t rawtime;
  struct tm *timeinfo;
  claspinfo_t *info = (claspinfo_t *) data;
  /* output header */
  time(&rawtime);
  timeinfo = localtime (&rawtime);
  fprintf(info->dev, "# clasp\t\t%s", asctime(timeinfo));
  fprintf(info->dev, "# version\t\t%s\n", VERSION);
  fprintf(info->dev, "# inputfile\t\t%s\n", info->infilename);
  if (info->outfilename){
    fprintf(info->dev, "# outputfile\t\t%s\n", info->outfilename);
  }
  else {
    fprintf(info->dev, "# output\t\tstdout\n");
  }
  fprintf(info->dev, "# col_order\t\t");
  fprintf(info->dev, "qry_st:%d, qry_end:%d, db_st:%d, db_end:%d, score:%d\n",
	  info->colorder[0] + 1, info->colorder[1] + 1, info->colorder[2] + 1,
	  info->colorder[3] + 1, info->colorder[4] + 1);
  if (info->idcol != NULL){
    fprintf(info->dev, "# id_col(s)\t\t");
    for (i = 0; i < info->idcolnum; i++){
      fprintf(info->dev, "%d", info->idcol[i] + 1);
      if (i < info->idcolnum - 1){
	fprintf(info->dev, ", ");
      }
      else {
	fprintf(info->dev, "\n");
      }
    }
  }
  if (info->chainmode == SOP){
    fprintf(info->dev, "# gap_costs\t\tsum-of-pair\n");
  }
  else {
    fprintf(info->dev, "# gap_costs\t\tlinear\n");	
  }
  #ifndef BINTREE
  fprintf(info->dev, "# priority_queue\tjohnson queue\n");
  //#ifdef VEB_DEFERRED
  //fprintf(info->dev, "# init_method\t\tdeferred\n");
  //#else
  //fprintf(info->dev, "# init_method\t\tnormal\n");
  //#endif
  #else
  fprintf(info->dev, "# priority_queue\t\tbinary search tree\n");
  //#ifdef BIN_DEFERRED
  //fprintf(info->dev, "# init_method\t\tdeferred\n");
  //#else
  //fprintf(info->dev, "# init_method\t\tnormal\n");
  //#endif
  #endif
  fprintf(info->dev, "# parameter\t\tlambda=%g\n", info->lambda);
  fprintf(info->dev, "# parameter\t\tepsilon=%g\n", info->epsilon);
  fprintf(info->dev, "# parameter\t\tmaxgap=%d\n", info->maxgap);
  fprintf(info->dev, "# parameter\t\tminscore=%g\n", info->minscore);
  fprintf(info->dev, "# parameter\t\tminfrag=%d\n", info->minfrag);
  fprintf(info->dev, "# colheader\n");
  if (info->outputorig){
    fprintf(info->dev, "# orig_fragments\n");
  }
  else {
    fprintf(info->dev, "# type\t");
    if (info->idcol != NULL){
      for (i = 0; i < info->idcolnum; i++){
	fprintf(info->dev, "id\t");
      }
    }
    fprintf(info->dev, "qry_st\tqry_end\tdb_st\tdb_end\tscore\n");
  }
}

/*-------------------------- bl_slmatchInitFromFile ----------------------------
 *    
 * @brief           initialize container with slmatch_t from tab-separated
 *                  file; comment lines (first character '#') and lines with
 *                  an end position before the start position are skipped,
 *                  any other malformed line forces exit(-1)
 * @author          Christian Otto
 * @param fragments container receiving one slmatch_t per accepted line
 * @param lines     if not NULL: container receiving one rebuilt copy of the
 *                  line (char *) per accepted fragment, addressable via the
 *                  idx member of the fragment
 * @param subject   container of the distinct identifier strings (char *),
 *                  only accessed if idcol is not NULL
 * @param filename  path of the input file
 * @param delim     delimiters used to tokenize a line; the first character
 *                  is used to join identifier columns and rebuilt lines
 * @param colorder  five zero-based column indices: query start, query end,
 *                  database start, database end, score
 * @param idcol     zero-based identifier column indices or NULL
 * @param idcolnum  number of entries in idcol
 *   
 */
void bl_slmatchInitFromFile(Container *fragments, Container *lines, Container *subject,
			    char *filename, char *delim, Uint *colorder,
			    Uint *idcol, int idcolnum){
  int num, *space = NULL;
  Uint i, j, len, rows = 0;
  char *ch, **file = NULL;
  stringset_t *content = NULL;
  char *field, *line, COMMENT = '#';
  BOOL found;
  slmatch_t frag;

  /* read file (empty lines already discarded) */
  file = readlines(filename, &rows);

  for (i = 0; i < rows; i++){
    content = tokensToStringset(space, delim, file[i], strlen(file[i]));
    free(file[i]);
    /*
     * ignore comment lines (the token set must be released on every
     * path that skips the line, otherwise it is leaked)
     */
    if (content->noofstrings > 0 &&
        content->strings[0].str[0] == COMMENT){
      destructStringset(space, content);
      continue;
    }
    bl_slmatchInit(&frag, 0);
    /* insufficient number of cols (at least pos and scr) */
    if (content->noofstrings < 5){
      DBG("%s,%d: insufficient number of columns. Exit forced.\n",
          filename, i + 1);
      exit(-1);
    }
    /* invalid column number given at init */
    for (j = 0; j < 5; j++){
      if (content->noofstrings <= colorder[j]){
        DBG("%s,%d: defined column number '%d' not available. Exit forced.\n",
            filename, i + 1, colorder[j] + 1);
        exit(-1);
      }
    }
    if (idcol != NULL){
      for (j = 0; j < idcolnum; j++){
        if (content->noofstrings <= idcol[j]){
          DBG("%s,%d: identifier column number '%d' not available. \
Exit forced.\n", filename, i + 1, idcol[j] + 1);
          exit(-1);
        }
      }
    }
    /* database start */
    field = content->strings[colorder[2]].str;
    num = (int) strtol(field, &ch, 10);
    if (num < 0 || *ch != '\0'){ 
      DBG("%s,%d: invalid database start information in column %d. \
Exit forced.\n", filename, i + 1, colorder[2] + 1);
      exit(-1);
    }
    frag.p = num;
    /* database end */
    field = content->strings[colorder[3]].str;
    num = (int) strtol(field, &ch, 10);
    if (num < 0 || *ch != '\0'){ 
      DBG("%s,%d: invalid database end information in column %d. \
Exit forced.\n", filename, i + 1, colorder[3] + 1);
      exit(-1);
    }
    if (num < frag.p){
      DBG("%s,%d: file omitted due to database end before start.\n",
          filename, i + 1);
      destructStringset(space, content);
      continue;
    }
    /* q holds the length on the database, not the end position */
    frag.q = num - frag.p + 1;
    /* query start */
    field = content->strings[colorder[0]].str;
    num = (int) strtol(field, &ch, 10);
    if (num < 0 || *ch != '\0'){ 
      DBG("%s,%d: invalid query start information in column %d. \
Exit forced.\n", filename, i + 1, colorder[0] + 1);
      exit(-1);
    }
    frag.i = num;
    /* query end */
    field = content->strings[colorder[1]].str;
    num = (int) strtol(field, &ch, 10);
    if (num < 0 || *ch != '\0'){ 
      DBG("%s,%d: invalid query end information in column %d. \
Exit forced.\n", filename, i + 1, colorder[1] + 1);
      exit(-1);
    }
    if (num < frag.i){
      DBG("%s,%d: file omitted due to query end before begin.\n",
          filename, i + 1);
      destructStringset(space, content);
      continue;
    }
    /* j holds the length on the query, not the end position */
    frag.j = num - frag.i + 1;
    /* score */
    field = content->strings[colorder[4]].str;
    frag.scr = strtod(field, &ch);
    if (frag.scr < 0 || *ch != '\0'){
      DBG("%s,%d: invalid score information in column %d. \
Exit forced.\n", filename, i + 1, colorder[4] + 1);
      exit(-1);
    }
    /* subject (optionally) */
    if (idcol != NULL){
      /* join all identifier columns, separated by the first delimiter */
      len = 0;
      field = NULL;
      for (j = 0; j < idcolnum; j++){
        field = (char *) realloc(field, len + content->strings[idcol[j]].len + 1);
        memmove(field + len, content->strings[idcol[j]].str,
                content->strings[idcol[j]].len);
        len += content->strings[idcol[j]].len + 1;
        field[len - 1] = *delim;
      }
      /* the trailing delimiter becomes the string terminator */
      field[len - 1] = '\0';
      found = 0;
      /* linear search (time expensive but expectably only few entries */
      for (j = 0; j < bl_containerSize(subject); j++){
        /* same subject found -> assign same id */
        if (strcmp(field, *(char **) bl_containerGet(subject, j)) == 0){
          frag.subject = j;
          found = 1;
          break;
        }
      }
      /* not found -> insert subject at end (container keeps the pointer) */
      if (found == 0){
        bl_containerAdd(subject, &field);
        frag.subject = bl_containerSize(subject) - 1;
      }
      else {
        free(field);
      }
    }
    else {
      frag.subject = 0;
    }
    /* index of line in original input */
    frag.idx = bl_containerSize(fragments);

    bl_containerAdd(fragments, &frag);
    /* store entire line in Container lines */
    if (lines != NULL){
      /* rebuild the line from its tokens, joined by the first delimiter */
      len = 0;
      line = NULL;
      for (j = 0; j < content->noofstrings; j++){
        line = (char *) realloc(line, len + content->strings[j].len + 1);
        memmove(line + len, content->strings[j].str,
                content->strings[j].len);
        len += content->strings[j].len + 1;
        line[len - 1] = *delim;
      }
      line[len - 1] = '\0';
      bl_containerAdd(lines, &line);
    }
    destructStringset(space, content);
  }
  free(file);
}
