生物技术

开发平台：
C/C++

splign_util.cpp：源码内容
							/*
 * ===========================================================================
 * PRODUCTION $Log: splign_util.cpp,v $
 * PRODUCTION Revision 1000.0  2004/06/01 18:13:08  gouriano
 * PRODUCTION PRODUCTION: IMPORTED [GCC34_MSVC7] Dev-tree R1.8
 * PRODUCTION
 * ===========================================================================
 */
/* $Id: splign_util.cpp,v 1000.0 2004/06/01 18:13:08 gouriano Exp $
* ===========================================================================
*
*                            PUBLIC DOMAIN NOTICE                          
*               National Center for Biotechnology Information
*                                                                          
*  This software/database is a "United States Government Work" under the   
*  terms of the United States Copyright Act.  It was written as part of    
*  the author's official duties as a United States Government employee and 
*  thus cannot be copyrighted.  This software/database is freely available 
*  to the public for use. The National Library of Medicine and the U.S.    
*  Government have not placed any restriction on its use or reproduction.  
*                                                                          
*  Although all reasonable efforts have been taken to ensure the accuracy  
*  and reliability of the software and data, the NLM and the U.S.          
*  Government do not and cannot warrant the performance or results that    
*  may be obtained by using this software or data. The NLM and the U.S.    
*  Government disclaim all warranties, express or implied, including       
*  warranties of performance, merchantability or fitness for any particular
*  purpose.                                                                
*                                                                          
*  Please cite the author in any work or product based on this material.   
*
* ===========================================================================
*
* Author:  Yuri Kapustin
*
* File Description:  Helper functions
*                   
* ===========================================================================
*/
#include <ncbi_pch.hpp>
#include "splign_util.hpp"
#include "splign_hitparser.hpp"
#include <algo/align/align_exception.hpp>
#include <algorithm>
#include <math.h>
BEGIN_NCBI_SCOPE
#ifdef GENOME_PIPELINE
CInfTable::CInfTable(size_t cols): m_cols(cols)
{
  m_data.resize(m_cols, 0);
}
size_t CInfTable::Load(const string& filename)
{
  ifstream ifs (filename.c_str());
  size_t read = 0;
  char line [1024];
  while(ifs) {
    line[0] = 0;
    ifs.getline(line, sizeof line, 'n');
    if(line[0]) {
      if(line[0] != '#') {
	if(!x_ReadColumns(line)) {
	  return 0;
	}
	string accession;
	SInfo info;
	bool parsed = x_ParseLine(line, accession, info);
	if(!parsed) {
	  return 0;
	}
	m_map[accession] = info;
	++read;
      }
    }
  }
  return read;
}
bool CInfTable::x_ReadColumns(char* line)
{
  char* p0 = line;
  char* pe = p0 + strlen(line);
  char* p = p0;
  for(size_t i = 0; i < m_cols; ++i) {
    p = find(p0, pe, 't');
    if(i+1 < m_cols && p == pe) {
      return false;
    }
    m_data[i] = p0;
    *p = 0;
    p0 = p + 1;
  }
  return true;
}
bool CInfTable::GetInfo(const string& id, SInfo& info) {
  map<string, SInfo>::const_iterator ii = m_map.find(id);
  if(ii != m_map.end()) {
    info = ii->second;
    return true;
  }
  return false;
}
bool CInf_mRna::x_ParseLine (const char* line,
			     string& accession, SInfo& info)
{
  if(!line || !*line) {
    return false;
  }
  accession = x_GetCol(0);
  const char* buf_size = x_GetCol(2);
  sscanf(buf_size, "%d", &info.m_size);
  const char* buf = x_GetCol(3);
  SInfo::EStrand strand = SInfo::ePlus;
  if(buf[0]=='m') {
    strand = SInfo::eMinus;
  }
  buf = x_GetCol(4);
  if(buf[0] != '-') {
    char c;
    sscanf(buf, "%d,%d%c",
	   &info.m_polya_start,
	   &info.m_polya_end,
	   &c);
    if(c == 'm') {
      strand = SInfo::eMinus;
    }
  }
  buf = x_GetCol(9);
  if(strcmp(buf, "wrong_strand") == 0) {
    strand = SInfo::eUnknown;
  }
  info.m_strand = strand;
  return true;
}
bool CInf_EST::x_ParseLine (const char* line, string& accession, SInfo& info)
{
  if(!line || !*line) {
    return false;
  }
  accession = x_GetCol(0);
  const char* buf = x_GetCol(3);
  switch(buf[0]) {
      case '5': info.m_strand = SInfo::ePlus;    break;
      case '3': info.m_strand = SInfo::eMinus;   break;
      default:  info.m_strand = SInfo::eUnknown; break;
  }
  buf = x_GetCol(6);
  if(buf[0] != '-') {
    char c;
    sscanf(buf, "%d,%d%c",
	   &info.m_polya_start,
	   &info.m_polya_end,
	   &c);
    if(c == 'm' && info.m_strand == SInfo::ePlus ||
       c == 'p' && info.m_strand == SInfo::eMinus) {
      info.m_strand = SInfo::eUnknown;
    }
  }
  return true;
}
#endif // GENOME_PIPELINE
bool IsConsensus(const char* donor, const char* acceptor)
{
  return donor && acceptor &&
    (donor[0] == 'G' && (donor[1] == 'C' || donor[1] == 'T'))
    &&
    (acceptor[0] == 'A' && acceptor[1] == 'G');
}
void CleaveOffByTail(vector<CHit>* hits, size_t polya_start)
{
  const size_t hit_dim = hits->size();
  vector<size_t> vdel;
  for(size_t i = 0; i < hit_dim; ++i) {
    CHit& hit = (*hits)[i];
    if(size_t(hit.m_ai[0]) >= polya_start) {
      vdel.push_back(i);
    }
    else if(size_t(hit.m_ai[1]) >= polya_start) {
      hit.Move(1, polya_start-1, false);
      if(hit.IsConsistent() == false) {
	vdel.push_back(i);
      }
    }
  }
  const size_t del_dim = vdel.size();
  if(del_dim > 0) {
    for( size_t i = 0, j = 0, k = 0; j < hit_dim; ) {
      if(k < del_dim && j == vdel[k]) {
	++k;
	++j;
      }
      else if(i < j) {
	(*hits)[i++] = (*hits)[j++];
      }
      else { // i == j
	++i; ++j;
      }
    }
    hits->resize(hit_dim - del_dim);
  }
}
void XFilter(vector<CHit>* hits) 
{
  if(hits->size() == 0) return;
  int group_id = 0;
  CHitParser hp (*hits, group_id);
  hp.m_Strand = CHitParser::eStrand_Both;
  hp.m_SameOrder = false;
  hp.m_Method = CHitParser::eMethod_MaxScore;
  hp.m_CombinationProximity_pre  = -1;
  hp.m_CombinationProximity_post = -1;
  hp.m_MinQueryCoverage = 0;
  hp.m_MinSubjCoverage = 0;
  hp.m_MaxHitDistQ = kMax_Int;
  hp.m_MaxHitDistS = kMax_Int;
  hp.m_Prot2Nucl = false;
  hp.m_SplitQuery = CHitParser::eSplitMode_Clear;
  hp.m_SplitSubject = CHitParser::eSplitMode_Clear;
  hp.m_group_identification_method = CHitParser::eNone;
  hp.m_Query = (*hits)[0].m_Query;
  hp.m_Subj =  (*hits)[0].m_Subj;
  
  hp.Run( CHitParser::eMode_Normal );
  *hits = hp.m_Out;
}
void GetHitsMinMax(const vector<CHit>& vh,
		   size_t* qmin, size_t* qmax,
		   size_t* smin, size_t* smax)
{
  const size_t hit_dim = vh.size();
  
  *qmin = kMax_UInt, *qmax = 0;
  *smin = *qmin, *smax = *qmax;
  
  for(size_t i = 0; i < hit_dim; ++i) {
    if(size_t(vh[i].m_ai[0]) < *qmin)
      *qmin = vh[i].m_ai[0];
    if(size_t(vh[i].m_ai[1]) > *qmax)
      *qmax = vh[i].m_ai[1];
    if(size_t(vh[i].m_ai[2]) < *smin)
      *smin = vh[i].m_ai[2];
    if(size_t(vh[i].m_ai[3]) > *smax)
      *smax = vh[i].m_ai[3];
  }
}
// apply run-length encoding
string RLE(const string& in)
{
  string out;
  const size_t dim = in.size();
  if(dim == 0) {
    return kEmptyStr;
  }
  const char* p = in.c_str();
  char c0 = p[0];
  out.append(1, c0);
  size_t count = 1;
  for(size_t k = 1; k < dim; ++k) {
    char c = p[k];
    if(c != c0) {
      c0 = c;
      if(count > 1) {
        out += NStr::IntToString(count);
      }
      count = 1;
      out.append(1, c0);
    }
    else {
      ++count;
    }
  }
  if(count > 1) {
    out += NStr::IntToString(count);
  }
  return out;
}
CNWAligner::TScore ScoreByTranscript(const CNWAligner& aligner,
				     const char* transcript)
{
  static const string kBadSymMsg = "Unknown symbol in transcript: ";
  const int Wg  = aligner.GetWg();
  const int Ws  = aligner.GetWs();
  const int Wm  = aligner.GetWm();
  const int Wms = aligner.GetWms();
  const size_t dim = strlen(transcript);
  if(dim == 0) return 0;
  CNWAligner::TScore score = 0;
  char state1, state2;
  switch(transcript[0]) {
    case 'M':
    case 'R':  state1 = state2 = 0;
    break;
    case 'I':  state1 = 1; state2 = 0; score += Wg;
    break;
    case 'D':  state1 = 0; state2 = 1; score += Wg;
    break;
    default:
      NCBI_THROW(CAlgoAlignException, eInternal, kBadSymMsg + transcript[0]);
  }
  for(size_t i = 0; i < dim; ++i) {
    switch(transcript[i]) {
      case 'M': {
        state1 = state2 = 0;
        score += Wm;
      }
      break;
      case 'R': {
        state1 = state2 = 0;
        score += Wms;
      }
      break;
      case 'I': {
        if(state1 != 1) score += Wg;
        state1 = 1; state2 = 0;
        score += Ws;
      }
      break;
      case 'D': {
        if(state2 != 1) score += Wg;
        state1 = 0; state2 = 1;
        score += Ws;
      }
      break;
      default:
        NCBI_THROW(CAlgoAlignException, eInternal, kBadSymMsg + transcript[i]);
    }
  }
  return score;
}
END_NCBI_SCOPE
/*
 * ===========================================================================
 *
 * $Log: splign_util.cpp,v $
 * Revision 1000.0  2004/06/01 18:13:08  gouriano
 * PRODUCTION: IMPORTED [GCC34_MSVC7] Dev-tree R1.8
 *
 * Revision 1.8  2004/05/24 16:13:57  gorelenk
 * Added PCH ncbi_pch.hpp
 *
 * Revision 1.7  2004/05/17 14:50:57  kapustin
 * Add/remove/rearrange some includes and object declarations
 *
 * Revision 1.6  2004/04/23 20:33:32  ucko
 * Fix remaining use of sprintf.
 *
 * Revision 1.5  2004/04/23 18:43:58  ucko
 * <cmath> -> <math.h>, since some older compilers (MIPSpro) lack the wrappers.
 *
 * Revision 1.4  2004/04/23 16:50:48  kapustin
 * sprintf->CNcbiOstrstream
 *
 * Revision 1.3  2004/04/23 14:37:44  kapustin
 * *** empty log message ***
 *
 *
 * ===========================================================================
 */