/********************************************************************************************************
 * QRNA - Comparative analysis of biological sequences 
 *         with pair hidden Markov models, pair stochastic context-free
 *        grammars, and probabilistic evolutionary  models.
 *       
 * Version 2.0.0 (JUN 2003)
 *
 * Copyright (C) 2000-2003 Howard Hughes Medical Institute/Washington University School of Medicine
 * All Rights Reserved
 * 
 *     This source code is distributed under the terms of the
 *     GNU General Public License. See the files COPYING and LICENSE
 *     for details.
 ***********************************************************************************************************/

/* rnaoutput.c
 * derived from COVE's konings.c
 * SRE, Sun Aug 28 10:39:18 1994
 * 
 * Representation of secondary structure and secondary structural 
 * alignments using a variant of Danielle Konings' string notation.
 * 
 * See: Konings and Hogeweg, J. Mol. Biol. 207:597-614 1989
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

#include "funcs.h"
#include "globals.h"
#include "squid.h"
#include "structs.h"

#ifdef MEMDEBUG
#include "dbmalloc.h"
#endif


/* Function: CompareRNAStructures()
 * 
 * ER, Tue Oct  9 15:19:38 CDT 2001 
 *
 * Purpose:  Given 2 secondary structures in string notation, compare them
 *           position by position over a window of length L and print a
 *           one-line summary to ofp.
 *
 * Args:     ofp     - stream the summary line is written to
 *           start   - offset of the window inside both ss strings
 *           L       - length of the window
 *           ss_true - the 'given' (reference) structure string
 *           ss      - the 'calculated' structure string
 *
 * Both strings are converted with KHS2ct(); a position i is treated as
 * unpaired when its entry in the resulting array equals i.
 *
 * C coefficient: Matthews correlation coefficient.
 *
 *   Matthews, BW (1975) 
 *   Comparison of the predicted and observed secondary structure of T4 phage Lysozyme.
 *   Biochem. Biophys. Acta, 405, 442-451.
 *
 *   Pt = true  positives = agree.
 *   Pf = false positives = pairs - agree.
 *
 *   Nt = true  negatives = unpaired positions, summed over both structures.
 *   Nf = false negatives = pairs_true - agree;
 *
 *
 *   C = (Pt*Nt - Pf*Nf) / sqrt[ (Nt+Nf) * (Nt+Pf) * (Pt+Nf) * (Pt+Pf) ]
 *
 *   under the approximations Nf/Nt -> 0 and Pf/Nt -> 0 for N -> infty
 *                            Pt > 0 with at least Pt \sim Pf or Pt \sim Nf
 *
 *   C_app = sqrt[ Pt/(Pt+Nf) * Pt/(Pt+Pf) ] (geometric mean of sensitivity and specificity)
 *
 * Only C_app is reported; the exact coefficient was never printed and is
 * therefore not computed.
 *
 * Counts are accumulated per nucleotide, so every base pair contributes 2;
 * the values printed are halved to give numbers of pairs.
 */
void
CompareRNAStructures(FILE *ofp, int start, int L, char *ss_true, char *ss)
{
  int  *cc_true    = NULL;  /* NULL so the failure path below can always free() safely */
  int  *cc         = NULL;
  int   i;
  int   agree      = 0;     /* agree      = Pt                  */
  int   pairs      = 0;     /* pairs      = Pt + Pf             */
  int   pairs_true = 0;     /* pairs_true = Pt + Nf             */
  int   Nt         = 0;     /* unpaired positions, both structures */
  float sen, spe;
  float c_ap;               /* approximate Matthews coefficient */
 
  if (!KHS2ct(start+ss, L, FALSE, &cc)) Die ("Bad 'calculated' structure");
  if (!KHS2ct(start+ss_true, L, FALSE, &cc_true)) {
    Warn ("Bad 'given' structure");
    free(cc);
    free(cc_true);
    return;
  }
 
  for (i = 0; i < L; i++) {
    if (cc_true[i] != i) pairs_true++; else Nt++;
    if (cc[i]      != i) pairs++;      else Nt++;

    if (cc_true[i] != i && cc_true[i] == cc[i]) agree++;
  }

  /* Pt+Nf == pairs_true and Pt+Pf == pairs, so those are the denominators */
  if (pairs_true > 0) sen = 100.*agree/pairs_true; else sen = 0.0;
  if (pairs      > 0) spe = 100.*agree/pairs;      else spe = 0.0;

  c_ap = sqrt(sen * spe);

  fprintf(ofp, "true pairs %.1f\t found pairs %.1f\t agree %.1f (sen=%.2f spe=%.2f -- C_ap = %.2f)\n", 
          pairs_true/2.0, pairs/2.0, agree/2.0, 
          sen, spe, c_ap);
  
  /* each pair is counted from both of its ends, so all counts must be even */
  if ((pairs%2 != 0) || (pairs_true%2 != 0) || (agree%2 != 0)) 
    Die("Error in CompareRNAStrutures(); odd number of paired nucleotides\n");

  if (Nt%2 != 0) 
    Die("Error in CompareRNAStrutures(); odd number of total unpaired nucleotides\n");

  free(cc);
  free(cc_true);
}

/* Function: CompareRNAParses()
 *
 * Purpose:  Check that a calculated structure contains exactly as many
 *           paired positions as the given one over a window of length L,
 *           and print the number of pairs of the given structure to ofp.
 *
 * Args:     ofp     - stream the "Number of pairs" line is written to
 *           start   - offset of the window inside both ss strings
 *           L       - length of the window
 *           ss_true - the 'given' structure string
 *           ss      - the 'calculated' structure string
 *
 * Dies if the calculated structure cannot be converted, if the two
 * structures have a different number of paired positions, or if any
 * per-nucleotide count is odd. If the given structure cannot be
 * converted, a warning is issued and the function returns without
 * printing, as CompareRNAStructures() does.
 */
void
CompareRNAParses(FILE *ofp, int start, int L, char *ss_true, char *ss)
{
  int  *cc_true    = NULL;  /* NULL so the failure path below can always free() safely */
  int  *cc         = NULL;
  int   i;
  int   agree      = 0;     /* positions paired identically in both */
  int   pairs      = 0;     /* paired positions in calculated ss    */
  int   pairs_true = 0;     /* paired positions in given ss         */
  int   Nt         = 0;     /* unpaired positions, both structures  */
 
  if (!KHS2ct(start+ss, L, FALSE, &cc)) Die  ("Bad 'calculated' structure");
  if (!KHS2ct(start+ss_true, L, FALSE, &cc_true)) {
    /* do not go on indexing a structure that failed to convert */
    Warn ("Bad 'given' structure");
    free(cc);
    free(cc_true);
    return;
  }
 
  /* a position i is unpaired when its entry equals i */
  for (i = 0; i < L; i++) {
    if (cc_true[i] != i) pairs_true++; else Nt++;
    if (cc[i]      != i) pairs++;      else Nt++;

    if (cc_true[i] != i && cc_true[i] == cc[i]) agree++;
  }

  fprintf(ofp, "Number of pairs %.1f\n", pairs_true/2.0);
  if (pairs_true != pairs) Die ("You have not traced back all the pairs in the given SS");
  
  /* each pair is counted from both of its ends, so all counts must be even */
  if ((pairs%2 != 0) || (pairs_true%2 != 0) || (agree%2 != 0)) 
    Die("Error in CompareRNAParses(); odd number of paired nucleotides\n");

  if (Nt%2 != 0) 
    Die("Error in CompareRNAParses(); odd number of total unpaired nucleotides\n");

  free(cc);
  free(cc_true);
}

/* Function: Ct2KHS_ER()
 *
 * Purpose:  Build a structure string of length L from a pairing array.
 *           For window position j (0..L-1) the partner is read from
 *           ct[j+start]; when that partner lies downstream of j inside the
 *           window, position j is marked '>' and the partner '<'.
 *           Every other position is '.'.
 *
 * Args:     start  - offset added to j when indexing ct
 *           L      - length of the window and of the returned string
 *           ct     - pairing array; -1 (or any value <= j) means "no
 *                    downstream partner"
 *           ret_cc - RETURN: alloc'ed, NUL-terminated string of length L.
 *                    Must be free'd by caller.
 *
 * NOTE(review): ct is indexed with the offset position (j+start) but its
 * values are used as window-relative indices into the string -- confirm
 * against the callers that this is the intended convention.
 */
void
Ct2KHS_ER(int start, int L, int *ct, char **ret_cc)
{
  char *cc;
  int   j;
  int   partner;

  if (L < 0) L = 0;

  /* one extra byte so the result is a proper C string */
  cc = (char *) MallocOrDie(sizeof(char) * (L + 1));

  for (j = 0; j < L; j++) cc[j] = '.';
  cc[L] = '\0';

  for (j = 0; j < L; j++) {
    partner = ct[j + start];

    /* partner > j already excludes the -1 "unpaired" marker;
     * partner < L keeps the write inside the allocated string */
    if (partner > j && partner < L) { cc[j] = '>'; cc[partner] = '<'; }
  }
  
  *ret_cc = cc;
}


/* Function: IsRNAComplement()
 * 
 * Purpose:  Returns TRUE if sym1, sym2 are Watson-Crick complementary
 *           (A-U, U-A, C-G, G-C); the comparison is case-insensitive.
 *           If allow_gu is TRUE, G-U and U-G pairs also return TRUE.
 *           Any other combination returns FALSE.
 */
int
IsRNAComplement(char sym1, char sym2, int allow_gu)
{
  /* toupper() is only defined for EOF and values representable as
   * unsigned char, so a plain (possibly negative) char must be cast first */
  int up1 = toupper((unsigned char) sym1);
  int up2 = toupper((unsigned char) sym2);

  switch (up1) {
  case 'A': return (up2 == 'U')                        ? TRUE : FALSE;
  case 'C': return (up2 == 'G')                        ? TRUE : FALSE;
  case 'G': return (up2 == 'C' || (allow_gu && up2 == 'U')) ? TRUE : FALSE;
  case 'U': return (up2 == 'A' || (allow_gu && up2 == 'G')) ? TRUE : FALSE;
  default:  return FALSE;
  }
}

/* Function: RNAPairs()
 *
 * Purpose:  Count structure annotation over positions 0..sqinfo.len-1.
 *
 *           ret_pairs    - half the number of positions whose sqinfo.ss
 *                          character is not '.'; only counted when format
 *                          is kSquid or kSelex, otherwise 0.
 *           ret_cykpairs - half the number of positions whose entry in
 *                          the ss array is not -1.
 */
void
RNAPairs(SQINFO sqinfo, int *ss, int format, float *ret_pairs, float *ret_cykpairs)
{
  const int use_annotation = (format == kSquid || format == kSelex);
  float     annotated = 0.;
  float     traced    = 0.;
  int       pos;

  for (pos = 0; pos < sqinfo.len; pos++) 
    {
      /* sqinfo.ss is only looked at for the two formats that carry it */
      if (use_annotation && sqinfo.ss[pos] != '.') annotated += 1.;

      if (ss[pos] != -1) traced += 1.;
    }

  /* both ends of a pair were counted */
  *ret_pairs    = annotated/2.;
  *ret_cykpairs = traced/2.;
}

/* Function: TraceRNA()
 * 
 * Purpose:  Walk a traceback tree and produce two secondary structure
 *           strings of length rlen. Structure is defined by the node
 *           types of the trace, not by Watson-Crick-isms and stacking
 *           rules.
 *
 *           In ret_ss, the two positions emitted by a pairing node are
 *           written as '>' and '<'. In ret_cc, both are written as the
 *           SSAlphabet character of the current helix counter. All other
 *           positions are '.', except those before the first node's emiti
 *           or after its emitj, which are '-' in both strings.
 *
 *           The helix counter advances each time a node carrying the
 *           marker value 250 in its 'node' field is visited; the marker is
 *           written here onto every dpcP child before it is pushed, so the
 *           trace passed in is modified.
 *           
 * Args:     tr          - traceback structure; traversal starts at tr->nxtl
 *           rlen        - length of the returned strings
 *           lend        - offset added to every emiti/emitj of the trace
 *           watsoncrick - not used by this function
 *           ret_ss      - RETURN: alloc'ed '>' '<' '.' '-' string
 *           ret_cc      - RETURN: alloc'ed helix-labelled string
 *
 * Return:   ret_ss and ret_cc are NUL-terminated strings 0..rlen-1.
 *           Both must be free'd by caller.
 */
void
TraceRNA(struct tracekn_s *tr, int rlen, int lend, int watsoncrick, char **ret_ss, char **ret_cc)  
{ 
  const int              helix_mark = 250;   /* 'node' value flagging a new helix */
  struct traceknstack_s *stack;
  struct tracekn_s      *first;
  struct tracekn_s      *cur;
  char                  *labels;             /* becomes *ret_cc */
  char                  *brackets;           /* becomes *ret_ss */
  int                    pos;
  int                    left, right;
  int                    is_pair;
  int                    helix = -1;
 
  brackets = (char *) malloc ((sizeof(char) * rlen) + 1);
  if (brackets == NULL)
    Die("malloc failed");
  labels = (char *) malloc ((sizeof(char) * rlen) + 1);
  if (labels == NULL)
    Die("malloc failed");

  memset(labels, '.', rlen);
  labels[rlen] = '\0';
  memset(brackets, '.', rlen);
  brackets[rlen] = '\0';

  stack = InitTraceknstack();
  PushTraceknstack(stack, tr->nxtl);
  first = tr->nxtl;

  /* positions outside the span of the first node are not scored as
   * "RNA" state, so they get their own symbol
   */
  left  = first->emiti + lend;
  right = first->emitj + lend;
  for (pos = 0; pos < left; pos++) 
    {
      labels[pos]   = '-';
      brackets[pos] = '-';
    }
  for (pos = rlen-1; pos > right; pos--) 
    {
      labels[pos]   = '-';
      brackets[pos] = '-';
    }

  /* the first node opens helix 0 when it is one of these types */ 
  if (first->type == dpcP   || 
      first->type == dpcS2S || 
      first->type == dpcS2B || 
      first->type == dpcS2I) 
    helix++;

  for (;;)
    {
      cur = PopTraceknstack(stack);
      if (cur == NULL) break;

      is_pair = (cur->type == dpcP   || 
                 cur->type == dpcS1  || 
                 cur->type == dpcS2S || 
                 cur->type == dpcS2B || 
                 cur->type == dpcS2I || 
                 cur->type == dpcMV);

      if (is_pair)
        {
          left  = cur->emiti + lend;
          right = cur->emitj + lend;

          brackets[left]  = '>';
          brackets[right] = '<';
          
          if (cur->node == helix_mark) helix++;
          labels[left]  = SSAlphabet[helix];
          labels[right] = SSAlphabet[helix];
        }
      
      /* flag dpcP children as helix starts, then queue them: left, then right */
      if (cur->nxtl != NULL) 
        {
          if (cur->nxtl->type == dpcP) cur->nxtl->node = helix_mark; 
          PushTraceknstack(stack, cur->nxtl);
        }
      if (cur->nxtr != NULL) 
        {
          if (cur->nxtr->type == dpcP) cur->nxtr->node = helix_mark; 
          PushTraceknstack(stack, cur->nxtr); 
        }
    }
  
  FreeTraceknstack(stack);
  
  *ret_cc = labels;
  *ret_ss = brackets;
}

/* Function: Tracenrn()
 * 
 * Purpose:  Walk a traceback tree and produce two secondary structure
 *           strings of length rlen. Structure is defined by the node
 *           types of the trace, not by Watson-Crick-isms and stacking
 *           rules.
 *
 *           In ret_ss, the two positions emitted by a dpcPS or dpcPL node
 *           are written as '>' and '<'. In ret_cc, both are written as the
 *           SSAlphabet character of the current helix counter. All other
 *           positions are '.', except those before the first node's emiti
 *           or after its emitj, which are '-' in both strings.
 *
 *           The helix counter advances each time a node carrying the
 *           marker value 250 in its 'node' field is visited. The marker is
 *           written here onto every dpcPS/dpcPL child of a node that is
 *           not itself dpcPS/dpcPL, so the trace passed in is modified.
 *           Certain bulge configurations decrement the counter again so
 *           that they do not count as a new helix.
 *           
 * Args:     tr          - traceback structure; traversal starts at tr->nxtl
 *           rlen        - length of the returned strings
 *           lend        - offset added to every emiti/emitj of the trace
 *           watsoncrick - not used by this function
 *           ret_ss      - RETURN: alloc'ed '>' '<' '.' '-' string
 *           ret_cc      - RETURN: alloc'ed helix-labelled string
 *
 * Return:   ret_ss and ret_cc are NUL-terminated strings 0..rlen-1.
 *           Both must be free'd by caller.
 *
 * NOTE(review): nothing here keeps the helix counter inside the bounds of
 * SSAlphabet (it starts at -1 and can be decremented) -- confirm that the
 * grammar guarantees a valid index whenever a pair is written.
 */
void
Tracenrn(struct tracekn_s *tr, int rlen, int lend, int watsoncrick, char **ret_ss, char **ret_cc)  
{ 
  struct traceknstack_s *dolist;
  struct tracekn_s      *curr;
  char                   *cc;
  char                   *ss;
  int                     i;
  int                     num = -1;   /* helix counter, index into SSAlphabet */
 
  if ((ss = (char *) malloc ((sizeof(char) * rlen) + 1)) == NULL)
    Die("malloc failed");
  if ((cc = (char *) malloc ((sizeof(char) * rlen) + 1)) == NULL)
    Die("malloc failed");

  memset(cc, '.', rlen);
  cc[rlen] = '\0';
  memset(ss, '.', rlen);
  ss[rlen] = '\0';

  dolist = InitTraceknstack();
  PushTraceknstack(dolist, tr->nxtl);

  /* these positions are not scored as "RNA" state, so we want to distinguish them
   */
  for (i = 0; i < tr->nxtl->emiti+lend; i++) {
    cc[i] = '-';
    ss[i] = '-';
  }
  for (i = rlen-1; i > tr->nxtl->emitj+lend; i--) {
    cc[i] = '-';
    ss[i] = '-';
  }

  /* start first helix if any
   */
  if (tr->nxtl->type == dpcPS || tr->nxtl->type == dpcPL) num++;

  while ((curr = PopTraceknstack(dolist)) != NULL)
    {
      /* Exceptions for the kinds of bulges that do not increase the
       * number of helices: each of the four patterns below takes back
       * one helix count. This is very grammar dependent.
       */
      if (curr->node == V && curr->type == dpcBS && curr->prv->type == dpcPS && 
          curr->nxtl && curr->nxtl->node == V && curr->nxtl->type == dpcRS &&
          curr->nxtr && curr->nxtr->node == W && curr->nxtr->type == dpcPL) num --;

      if (curr->node == V && curr->type == dpcBS && curr->prv->type == dpcPS && 
          curr->nxtl && curr->nxtl->node == V && curr->nxtl->type == dpcRS &&
          curr->nxtr && curr->nxtr->node == W && curr->nxtr->type == dpcLL) num --;

      if (curr->node == V && curr->type == dpcBS && curr->prv->type == dpcPL && 
          curr->nxtl && curr->nxtl->node == V && curr->nxtl->type == dpcRS &&
          curr->nxtr && curr->nxtr->node == W && curr->nxtr->type == dpcPL) num --;

      if (curr->node == V && curr->type == dpcLS && 
          (curr->prv->type == dpcPL || curr->prv->type == dpcPS) && 
          curr->nxtl && curr->nxtl->node == W && 
          (curr->nxtl->type == dpcPL || curr->nxtl->type == dpcLL)) num --;

      /* a pairing node: mark both emitted positions */
      if (curr->type == dpcPS ||  curr->type == dpcPL)
        {
          ss[curr->emiti+lend] = '>';
          ss[curr->emitj+lend] = '<';
          
          if (curr->node == 250) num ++;   /* 250 flags the first pair of a helix */
          cc[curr->emiti+lend] = SSAlphabet[num];
          cc[curr->emitj+lend] = SSAlphabet[num];
        }
        
      /* a pairing child of a non-pairing node starts a new helix: flag it
       * with 250 before queueing it
       */
      if (curr->nxtl) { 
        if ((curr->type != dpcPS && curr->type != dpcPL) && (curr->nxtl->type == dpcPS || curr->nxtl->type == dpcPL)) {
          curr->nxtl->node = 250; 
        }
        PushTraceknstack(dolist, curr->nxtl); 
      }
        
      if (curr->nxtr) { 
        if ((curr->type != dpcPS && curr->type != dpcPL) && (curr->nxtr->type == dpcPS || curr->nxtr->type == dpcPL)) {
          curr->nxtr->node = 250; 
        }

        PushTraceknstack(dolist, curr->nxtr); 
      }
    }

  /* curr is NULL here (that is what ended the loop), so there is no node
   * left to free; the trace itself stays owned by the caller, only the
   * work stack is released.
   */
  FreeTraceknstack(dolist);

  *ret_cc = cc;
  *ret_ss = ss;
}























