生物技术

开发平台：
C/C++

nw_aligner.cpp：源码内容
							/*
 * ===========================================================================
 * PRODUCTION $Log: nw_aligner.cpp,v $
 * PRODUCTION Revision 1000.3  2004/06/01 18:04:52  gouriano
 * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.48
 * PRODUCTION
 * ===========================================================================
 */
/* $Id: nw_aligner.cpp,v 1000.3 2004/06/01 18:04:52 gouriano Exp $
 * ===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE                          
 *               National Center for Biotechnology Information
 *                                                                          
 *  This software/database is a "United States Government Work" under the   
 *  terms of the United States Copyright Act.  It was written as part of    
 *  the author's official duties as a United States Government employee and 
 *  thus cannot be copyrighted.  This software/database is freely available 
 *  to the public for use. The National Library of Medicine and the U.S.    
 *  Government have not placed any restriction on its use or reproduction.  
 *                                                                          
 *  Although all reasonable efforts have been taken to ensure the accuracy  
 *  and reliability of the software and data, the NLM and the U.S.           
 *  Government do not and cannot warrant the performance or results that    
 *  may be obtained by using this software or data. The NLM and the U.S.    
 *  Government disclaim all warranties, express or implied, including       
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.                                                                
 *                                                                          
 *  Please cite the author in any work or product based on this material.   
 *
 * ===========================================================================
 *
 * Authors:  Yuri Kapustin, Boris Kiryutin
 *
 * File Description:  CNWAligner implementation
 *                   
 * ===========================================================================
 *
 */
#include <ncbi_pch.hpp>
#include <algo/align/nw_aligner.hpp>
#include <algo/align/align_exception.hpp>
BEGIN_NCBI_SCOPE
// IUPACna alphabet (default)
const char g_nwaligner_nucleotides [] = "AGTCBDHKMNRSVWY";
CNWAligner::CNWAligner()
    : m_Wm(GetDefaultWm()),
      m_Wms(GetDefaultWms()),
      m_Wg(GetDefaultWg()),
      m_Ws(GetDefaultWs()),
      m_esf_L1(false), m_esf_R1(false), m_esf_L2(false), m_esf_R2(false),
      m_abc(g_nwaligner_nucleotides),
      m_prg_callback(0),
      m_Seq1(0), m_SeqLen1(0),
      m_Seq2(0), m_SeqLen2(0),
      m_score(kInfMinus)
{
    SetScoreMatrix(0);
}
CNWAligner::CNWAligner( const char* seq1, size_t len1,
                        const char* seq2, size_t len2,
                        const SNCBIPackedScoreMatrix* scoremat )
    : m_Wm(GetDefaultWm()),
      m_Wms(GetDefaultWms()),
      m_Wg(GetDefaultWg()),
      m_Ws(GetDefaultWs()),
      m_esf_L1(false), m_esf_R1(false), m_esf_L2(false), m_esf_R2(false),
      m_abc(g_nwaligner_nucleotides),
      m_prg_callback(0),
      m_Seq1(seq1), m_SeqLen1(len1),
      m_Seq2(seq2), m_SeqLen2(len2),
      m_score(kInfMinus)
{
    SetScoreMatrix(scoremat);
    SetSequences(seq1, len1, seq2, len2);
}
void CNWAligner::SetSequences(const char* seq1, size_t len1,
			      const char* seq2, size_t len2,
			      bool verify)
{
    if(!seq1 || !seq2) {
        NCBI_THROW(CAlgoAlignException, eBadParameter,
                   "NULL sequence pointer(s) passed");
    }
    if(verify) {
        size_t iErrPos1 = x_CheckSequence(seq1, len1);
	if(iErrPos1 < len1) {
	    ostrstream oss;
	    oss << "The first sequence is inconsistent with the current "
		<< "scoring matrix type. Symbol " << seq1[iErrPos1] << " at "
		<< iErrPos1;
	    string message = CNcbiOstrstreamToString(oss);
	    NCBI_THROW(CAlgoAlignException, eInvalidCharacter, message);
	}
	size_t iErrPos2 = x_CheckSequence(seq2, len2);
	if(iErrPos2 < len2) {
	    ostrstream oss;
	    oss << "The second sequence is inconsistent with the current "
		<< "scoring matrix type. Symbol " << seq2[iErrPos2] << " at "
		<< iErrPos2;
	    string message = CNcbiOstrstreamToString(oss);
	    NCBI_THROW(CAlgoAlignException, eInvalidCharacter, message);
	}
    }
    m_Seq1 = seq1;
    m_SeqLen1 = len1;
    m_Seq2 = seq2;
    m_SeqLen2 = len2;
    m_Transcript.clear();
}
void CNWAligner::SetEndSpaceFree(bool Left1, bool Right1,
                                 bool Left2, bool Right2)
{
    m_esf_L1 = Left1;
    m_esf_R1 = Right1;
    m_esf_L2 = Left2;
    m_esf_R2 = Right2;
}
// evaluate score for each possible alignment;
// fill out backtrace matrix
// bit coding (four bits per value): D E Ec Fc
// D:  1 if diagonal; 0 - otherwise
// E:  1 if space in 1st sequence; 0 if space in 2nd sequence
// Ec: 1 if gap in 1st sequence was extended; 0 if it is was opened
// Fc: 1 if gap in 2nd sequence was extended; 0 if it is was opened
//
const unsigned char kMaskFc  = 0x0001;
const unsigned char kMaskEc  = 0x0002;
const unsigned char kMaskE   = 0x0004;
const unsigned char kMaskD   = 0x0008;
CNWAligner::TScore CNWAligner::x_Align(const char* seg1, size_t len1,
                                       const char* seg2, size_t len2,
                                       vector<ETranscriptSymbol>* transcript)
{
    const size_t N1 = len1 + 1;
    const size_t N2 = len2 + 1;
    vector<TScore> stl_rowV (N2), stl_rowF(N2);
    TScore* rowV    = &stl_rowV[0];
    TScore* rowF    = &stl_rowF[0];
    TScore* pV = rowV - 1;
    const char* seq1 = seg1 - 1;
    const char* seq2 = seg2 - 1;
    const TNCBIScore (*sm) [NCBI_FSM_DIM] = m_ScoreMatrix.s;
    m_terminate = false;
    if(m_prg_callback) {
        m_prg_info.m_iter_total = N1*N2;
        m_prg_info.m_iter_done = 0;
        if(m_terminate = m_prg_callback(&m_prg_info)) {
	  return 0;
	}
    }
    bool bFreeGapLeft1  = m_esf_L1 && seg1 == m_Seq1;
    bool bFreeGapRight1 = m_esf_R1 && m_Seq1 + m_SeqLen1 - len1 == seg1;
    bool bFreeGapLeft2  = m_esf_L2 && seg2 == m_Seq2;
    bool bFreeGapRight2 = m_esf_R2 && m_Seq2 + m_SeqLen2 - len2 == seg2;
    TScore wgleft1   = bFreeGapLeft1? 0: m_Wg;
    TScore wsleft1   = bFreeGapLeft1? 0: m_Ws;
    TScore wg1 = m_Wg, ws1 = m_Ws;
    // index calculation: [i,j] = i*n2 + j
    vector<unsigned char> stl_bm (N1*N2);
    unsigned char* backtrace_matrix = &stl_bm[0];
    // first row
    size_t k;
    rowV[0] = wgleft1;
    for (k = 1; k < N2; k++) {
        rowV[k] = pV[k] + wsleft1;
        rowF[k] = kInfMinus;
        backtrace_matrix[k] = kMaskE | kMaskEc;
    }
    rowV[0] = 0;
	
    if(m_prg_callback) {
        m_prg_info.m_iter_done = k;
        m_terminate = m_prg_callback(&m_prg_info);
    }
    // recurrences
    TScore wgleft2   = bFreeGapLeft2? 0: m_Wg;
    TScore wsleft2   = bFreeGapLeft2? 0: m_Ws;
    TScore V  = rowV[N2 - 1];
    TScore V0 = wgleft2;
    TScore E, G, n0;
    unsigned char tracer;
    size_t i, j;
    for(i = 1;  i < N1 && !m_terminate;  ++i) {
        
        V = V0 += wsleft2;
        E = kInfMinus;
        backtrace_matrix[k++] = kMaskFc;
        unsigned char ci = seq1[i];
        if(i == N1 - 1 && bFreeGapRight1) {
                wg1 = ws1 = 0;
        }
        TScore wg2 = m_Wg, ws2 = m_Ws;
        for (j = 1; j < N2; ++j, ++k) {
            G = pV[j] + sm[ci][(unsigned char)seq2[j]];
            pV[j] = V;
            n0 = V + wg1;
            if(E >= n0) {
                E += ws1;      // continue the gap
                tracer = kMaskEc;
            }
            else {
                E = n0 + ws1;  // open a new gap
                tracer = 0;
            }
            if(j == N2 - 1 && bFreeGapRight2) {
                wg2 = ws2 = 0;
            }
            n0 = rowV[j] + wg2;
            if(rowF[j] >= n0) {
                rowF[j] += ws2;
                tracer |= kMaskFc;
            }
            else {
                rowF[j] = n0 + ws2;
            }
            if (E >= rowF[j]) {
                if(E >= G) {
                    V = E;
                    tracer |= kMaskE;
                }
                else {
                    V = G;
                    tracer |= kMaskD;
                }
            } else {
                if(rowF[j] >= G) {
                    V = rowF[j];
                }
                else {
                    V = G;
                    tracer |= kMaskD;
                }
            }
            backtrace_matrix[k] = tracer;
        }
        pV[j] = V;
        if(m_prg_callback) {
            m_prg_info.m_iter_done = k;
            if(m_terminate = m_prg_callback(&m_prg_info)) {
                break;
            }
        }
       
    }
    if(!m_terminate) {
        x_DoBackTrace(backtrace_matrix, N1, N2, transcript);
    }
    return V;
}
CNWAligner::TScore CNWAligner::Run()
{
    if(!m_Seq1 || !m_Seq2) {
        NCBI_THROW(CAlgoAlignException, eNoData,
                   "Sequence data not available for one or both sequences");
    }
    if(!x_CheckMemoryLimit()) {
        NCBI_THROW(CAlgoAlignException, eMemoryLimit, "Out of space");
    }
    m_score = x_Run();
    return m_score;
}
CNWAligner::TScore CNWAligner::x_Run()
{
    try {
    
        if(m_guides.size() == 0) {
            m_score = x_Align(m_Seq1,m_SeqLen1, m_Seq2,m_SeqLen2,
                              &m_Transcript);
        }
        else {
            
            // run the algorithm for every segment between hits
            m_Transcript.clear();
            size_t guides_dim = m_guides.size() / 4;
            size_t q1 = m_SeqLen1, q0, s1 = m_SeqLen2, s0;
            vector<ETranscriptSymbol> trans;
            
            // save original esf settings
            bool esf_L1 = m_esf_L1, esf_R1 = m_esf_R1,
                 esf_L2 = m_esf_L2, esf_R2 = m_esf_R2;
            SetEndSpaceFree(false, esf_R1, false, esf_R2); // last region
            for(size_t istart = 4*guides_dim, i = istart; i != 0; i -= 4) {
                q0 = m_guides[i - 3] + 1;
                s0 = m_guides[i - 1] + 1;
                size_t dim_query = q1 - q0, dim_subj = s1 - s0;
                x_Align(m_Seq1 + q0, dim_query, m_Seq2 + s0, dim_subj, &trans);
                copy(trans.begin(), trans.end(), back_inserter(m_Transcript));
                size_t dim_hit = m_guides[i - 3] - m_guides[i - 4] + 1;
                for(size_t k = 0; k < dim_hit; ++k) {
                    m_Transcript.push_back(eTS_Match);
                }
                q1 = m_guides[i - 4];
                s1 = m_guides[i - 2];
                trans.clear();
                if(i == istart) {  // middle regions
                    SetEndSpaceFree(false, false, false, false);
                }
            }
            SetEndSpaceFree(esf_L1, false, esf_L2, false); // first region
            x_Align(m_Seq1, q1, m_Seq2, s1, &trans);
            SetEndSpaceFree(esf_L1, esf_R1, esf_L2, esf_R2); // restore
            copy(trans.begin(), trans.end(), back_inserter(m_Transcript));
            m_score = x_ScoreByTranscript();
        }
    }
    
    catch( bad_alloc& ) {
        
        NCBI_THROW(CAlgoAlignException, eMemoryLimit, "Memory limit exceeded");
    }
    
    return m_score;
}
// perform backtrace step;
void CNWAligner::x_DoBackTrace(const unsigned char* backtrace,
                               size_t N1, size_t N2,
                               vector<ETranscriptSymbol>* transcript)
{
    transcript->clear();
    transcript->reserve(N1 + N2);
    size_t k = N1*N2 - 1;
    while (k != 0) {
        unsigned char Key = backtrace[k];
        if (Key & kMaskD) {
            transcript->push_back(eTS_Match);
            k -= N2 + 1;
        }
        else if (Key & kMaskE) {
            transcript->push_back(eTS_Insert); --k;
            while(k > 0 && (Key & kMaskEc)) {
                transcript->push_back(eTS_Insert);
                Key = backtrace[k--];
            }
        }
        else {
            transcript->push_back(eTS_Delete);
            k -= N2;
            while(k > 0 && (Key & kMaskFc)) {
                transcript->push_back(eTS_Delete);
                Key = backtrace[k];
                k -= N2;
            }
        }
    }
}
void  CNWAligner::SetPattern(const vector<size_t>& guides)
{
    size_t dim = guides.size();
    const char* err = 0;
    if(dim % 4 == 0) {
        for(size_t i = 0; i < dim; i += 4) {
            if( guides[i] > guides[i+1] || guides[i+2] > guides[i+3] ) {
                err = "Pattern hits must be specified in plus strand";
		break;
            }
            if(i > 4) {
                if(guides[i] <= guides[i-3] || guides[i+2] <= guides[i-2]){
                    err = "Pattern hits coordinates must be sorted";
                    break;
                }
            }
            size_t dim1 = guides[i + 1] - guides[i];
            size_t dim2 = guides[i + 3] - guides[i + 2];
            if( dim1 != dim2) {
                err = "Pattern hits must have equal length on both sequences";
                break;
            }
            if(guides[i+1] >= m_SeqLen1 || guides[i+3] >= m_SeqLen2) {
                err = "One or several pattern hits are out of range";
                break;
            }
        }
    }
    else {
        err = "Pattern must have a dimension multiple of four";
    }
    if(err) {
        NCBI_THROW(CAlgoAlignException, eBadParameter, err);
    }
    else {
        m_guides = guides;
    }
}
void CNWAligner::GetEndSpaceFree(bool* L1, bool* R1, bool* L2, bool* R2) const
{
    if(L1) *L1 = m_esf_L1;
    if(R1) *R1 = m_esf_R1;
    if(L2) *L2 = m_esf_L2;
    if(R2) *R2 = m_esf_R2;
}
// Return transcript as a readable string
string CNWAligner::GetTranscriptString(void) const
{
    const size_t dim = m_Transcript.size();   
    string s;
    s.resize(dim);
    size_t i1 = 0, i2 = 0, i = 0;
    for (int k = dim - 1; k >= 0;  --k) {
        ETranscriptSymbol c0 = m_Transcript[k];
        char c = 0;
        switch ( c0 ) {
            case eTS_Match: 
            case eTS_Replace: {
                c = (m_Seq1[i1++] == m_Seq2[i2++])? 'M': 'R';
            }
            break;
            case eTS_Insert: {
                c = 'I';
                ++i2;
            }
            break;
            case eTS_Delete: {
                c = 'D';
                ++i1;
            }
            break;
            case eTS_Intron: {
                c = '+';
                ++i2;
            }
            break;
	    default: {
	      NCBI_THROW(CAlgoAlignException, eInternal,
                         "Unexpected transcript symbol");
	    }
	  
        }
        s[i++] = c;
    }
    if(i < s.size()) {
        s.resize(i + 1);
    }
    return s;
}
void CNWAligner::SetProgressCallback ( FProgressCallback prg_callback,
				       void* data )
{
    m_prg_callback = prg_callback;
    m_prg_info.m_data = data;
}
 
void CNWAligner::SetScoreMatrix(const SNCBIPackedScoreMatrix* psm)
{
    if(psm) {
        m_abc = psm->symbols;
	NCBISM_Unpack(psm, &m_ScoreMatrix);
    }
    else { // assume IUPACna
        m_abc = g_nwaligner_nucleotides;
	memset(&m_ScoreMatrix.s, m_Wms, sizeof m_ScoreMatrix.s);
	// set diag to Wm except ambiguity symbols
        unsigned char c1, c2;
	const size_t kNuclSize = strlen(g_nwaligner_nucleotides);
	for(size_t i = 0; i < kNuclSize; ++i) {
	    c1 = g_nwaligner_nucleotides[i];
	    for(size_t j = 0; j < i; ++j) {
	        c2 = g_nwaligner_nucleotides[j];
		m_ScoreMatrix.s[c1][c2] = m_ScoreMatrix.s[c2][c1] = m_Wms;
	    }
	    m_ScoreMatrix.s[c1][c1] = i < 4? m_Wm: m_Wms;
	}
    }
}
// Check that all characters in sequence are valid for 
// the current sequence type.
// Return an index to the first invalid character
// or len if all characters are valid.
size_t CNWAligner::x_CheckSequence(const char* seq, size_t len) const
{
    char Flags [256];
    memset(Flags, 0, sizeof Flags);
    const size_t abc_size = strlen(m_abc);
    
    size_t k;
    for(k = 0; k < abc_size; ++k) {
        Flags[unsigned(m_abc[k])] = 1;
    }
    for(k = 0; k < len; ++k) {
        if(Flags[unsigned(seq[k])] == 0)
            break;
    }
    return k;
}
CNWAligner::TScore CNWAligner::GetScore() const 
{
  if(m_Transcript.size()) {
      return m_score;
  }
  else {
    NCBI_THROW(CAlgoAlignException, eNoData,
               "Sequences not aligned yet");
  }
}
bool CNWAligner::x_CheckMemoryLimit()
{
    const size_t elem_size = GetElemSize();
    const size_t gdim = m_guides.size();
    const size_t max_mem = kMax_UInt / 2;
    if(gdim) {
        size_t dim1 = m_guides[0], dim2 = m_guides[2];
        double mem = double(dim1)*dim2*elem_size;
        if(mem >= max_mem) {
            return false;
        }
        for(size_t i = 4; i < gdim; i += 4) {
            dim1 = m_guides[i] - m_guides[i-3] + 1;
            dim2 = m_guides[i + 2] - m_guides[i-1] + 1;
            mem = double(dim1)*dim2*elem_size;
            if(mem >= max_mem) {
                return false;
            }
        }
        dim1 = m_SeqLen1 - m_guides[gdim-3];
        dim2 = m_SeqLen2 - m_guides[gdim-1];
        mem = double(dim1)*dim2*elem_size;
        if(mem >= max_mem) {
            return false;
        }
        return true;
    }
    else {
        double mem = double(m_SeqLen1 + 1)*(m_SeqLen2 + 1)*elem_size;
        return mem < max_mem;
    }
}
CNWAligner::TScore CNWAligner::x_ScoreByTranscript() const
{
    const size_t dim = m_Transcript.size();
    if(dim == 0) return 0;
    vector<ETranscriptSymbol> transcript (dim);
    for(size_t i = 0; i < dim; ++i) {
        transcript[i] = m_Transcript[dim - i - 1];
    }
    TScore score = 0;
    const char* p1 = m_Seq1;
    const char* p2 = m_Seq2;
    int state1;   // 0 = normal, 1 = gap, 2 = intron
    int state2;   // 0 = normal, 1 = gap
    switch(transcript[0]) {
    case eTS_Match:    state1 = state2 = 0; break;
    case eTS_Insert:   state1 = 1; state2 = 0; score += m_Wg; break;
    case eTS_Delete:   state1 = 0; state2 = 1; score += m_Wg; break;
    default: {
        NCBI_THROW(CAlgoAlignException, eInternal,
                   "Invalid transcript symbol");
        }
    }
    const TNCBIScore (*sm) [NCBI_FSM_DIM] = m_ScoreMatrix.s;
    for(size_t i = 0; i < dim; ++i) {
        unsigned char c1 = m_Seq1? *p1: 'N';
        unsigned char c2 = m_Seq2? *p2: 'N';
        switch(transcript[i]) {
        case eTS_Match: {
            state1 = state2 = 0;
            score += sm[c1][c2];
            ++p1; ++p2;
        }
        break;
        case eTS_Insert: {
            if(state1 != 1) score += m_Wg;
            state1 = 1; state2 = 0;
            score += m_Ws;
            ++p2;
        }
        break;
        case eTS_Delete: {
            if(state2 != 1) score += m_Wg;
            state1 = 0; state2 = 1;
            score += m_Ws;
            ++p1;
        }
        break;
        
        default: {
        NCBI_THROW(CAlgoAlignException, eInternal,
                   "Invalid transcript symbol");
        }
        }
    }
    if(m_esf_L1) {
        size_t g = 0;
        for(size_t i = 0; i < dim; ++i) {
            if(transcript[i] == eTS_Insert) ++g; else break;
        }
        if(g > 0) {
            score -= (m_Wg + g*m_Ws);
        }
    }
    if(m_esf_L2) {
        size_t g = 0;
        for(size_t i = 0; i < dim; ++i) {
            if(transcript[i] == eTS_Delete) ++g; else break;
        }
        if(g > 0) {
            score -= (m_Wg + g*m_Ws);
        }
    }
    if(m_esf_R1) {
        size_t g = 0;
        for(int i = dim - 1; i >= 0; --i) {
            if(transcript[i] == eTS_Insert) ++g; else break;
        }
        if(g > 0) {
            score -= (m_Wg + g*m_Ws);
        }
    }
    if(m_esf_R2) {
        size_t g = 0;
        for(int i = dim - 1; i >= 0; --i) {
            if(transcript[i] == eTS_Delete) ++g; else break;
        }
        if(g > 0) {
            score -= (m_Wg + g*m_Ws);
        }
    }
    return score;
}
size_t CNWAligner::GetLeftSeg(size_t* q0, size_t* q1,
                              size_t* s0, size_t* s1,
                              size_t min_size) const
{
    size_t trdim = m_Transcript.size();
    size_t cur = 0, maxseg = 0;
    const char* p1 = m_Seq1;
    const char* p2 = m_Seq2;
    size_t i0 = 0, j0 = 0, imax = i0, jmax = j0;
    for(int k = trdim - 1; k >= 0; --k) {
        switch(m_Transcript[k]) {
            case eTS_Insert: {
                ++p2;
                if(cur > maxseg) {
                    maxseg = cur;
                    imax = i0;
                    jmax = j0;
                    if(maxseg >= min_size) goto ret_point;
                }
                cur = 0;
            }
            break;
            case eTS_Delete: {
            ++p1;
            if(cur > maxseg) {
                maxseg = cur;
                imax = i0;
                jmax = j0;
                if(maxseg >= min_size) goto ret_point;
            }
            cur = 0;
            }
            break;
            case eTS_Match:
            case eTS_Replace: {
                if(*p1 == *p2) {
                    if(cur == 0) {
                        i0 = p1 - m_Seq1;
                        j0 = p2 - m_Seq2;
                    }
                    ++cur;
                }
                else {
                    if(cur > maxseg) {
                        maxseg = cur;
                        imax = i0;
                        jmax = j0;
                        if(maxseg >= min_size) goto ret_point;
                    }
                    cur = 0;
                }
                ++p1;
                ++p2;
            }
            break;
            default: {
                NCBI_THROW(CAlgoAlignException, eInternal,
                           "Invalid transcript symbol");
            }
        }
    }
    if(cur > maxseg) {
      maxseg = cur;
      imax = i0;
      jmax = j0;
    }
 ret_point:
    *q0 = imax; *s0 = jmax;
    *q1 = *q0 + maxseg - 1;
    *s1 = *s0 + maxseg - 1;
    return maxseg;
}
size_t CNWAligner::GetRightSeg(size_t* q0, size_t* q1,
                               size_t* s0, size_t* s1,
                               size_t min_size) const
{
    size_t trdim = m_Transcript.size();
    size_t cur = 0, maxseg = 0;
    const char* seq1_end = m_Seq1 + m_SeqLen1;
    const char* seq2_end = m_Seq2 + m_SeqLen2;
    const char* p1 = seq1_end - 1;
    const char* p2 = seq2_end - 1;
    size_t i0 = m_SeqLen1 - 1, j0 = m_SeqLen2 - 1,
           imax = i0, jmax = j0;
    for(size_t k = 0; k < trdim; ++k) {
        switch(m_Transcript[k]) {
            case eTS_Insert: {
                --p2;
                if(cur > maxseg) {
                    maxseg = cur;
                    imax = i0;
                    jmax = j0;
                    if(maxseg >= min_size) goto ret_point;
                }
                cur = 0;
            }
            break;
            case eTS_Delete: {
                --p1;
                if(cur > maxseg) {
                    maxseg = cur;
                    imax = i0;
                    jmax = j0;
                    if(maxseg >= min_size) goto ret_point;
		}
                cur = 0;
            }
            break;
            case eTS_Match:
            case eTS_Replace: {
                if(*p1 == *p2) {
                    if(cur == 0) {
                        i0 = p1 - m_Seq1;
                        j0 = p2 - m_Seq2;
                    }
                    ++cur;
                }
                else {
                    if(cur > maxseg) {
                        maxseg = cur;
                        imax = i0;
                        jmax = j0;
                        if(maxseg >= min_size) goto ret_point;
                    }
                    cur = 0;
                }
                --p1;
                --p2;
            }
            break;
            default: {
                NCBI_THROW(CAlgoAlignException, eInternal,
                           "Invalid transcript symbol");
            }
        }
    }
    if(cur > maxseg) {
      maxseg = cur;
      imax = i0;
      jmax = j0;
    }
 ret_point:
    *q1 = imax; *s1 = jmax;
    *q0 = imax - maxseg + 1;
    *s0 = jmax - maxseg + 1;
    return maxseg;
}
size_t CNWAligner::GetLongestSeg(size_t* q0, size_t* q1,
                                 size_t* s0, size_t* s1) const
{
    size_t trdim = m_Transcript.size();
    size_t cur = 0, maxseg = 0;
    const char* p1 = m_Seq1;
    const char* p2 = m_Seq2;
    size_t i0 = 0, j0 = 0, imax = i0, jmax = j0;
    for(int k = trdim-1; k >= 0; --k) {
        switch(m_Transcript[k]) {
            case eTS_Insert: {
                ++p2;
                if(cur > maxseg) {
                    maxseg = cur;
                    imax = i0;
                    jmax = j0;
                }
                cur = 0;
            }
            break;
            case eTS_Delete: {
            ++p1;
            if(cur > maxseg) {
                maxseg = cur;
                imax = i0;
                jmax = j0;
            }
            cur = 0;
            }
            break;
            case eTS_Match:
            case eTS_Replace: {
                if(*p1 == *p2) {
                    if(cur == 0) {
                        i0 = p1 - m_Seq1;
                        j0 = p2 - m_Seq2;
                    }
                    ++cur;
                }
                else {
		    if(cur > maxseg) {
                        maxseg = cur;
                        imax = i0;
                        jmax = j0;
                    }
                    cur = 0;
                }
                ++p1;
                ++p2;
            }
            break;
            default: {
                NCBI_THROW(CAlgoAlignException, eInternal,
                           "Invalid transcript symbol");
            }
        }
    }
    
    if(cur > maxseg) {
        maxseg = cur;
        imax = i0;
        jmax = j0;
    }
    
    *q0 = imax; *s0 = jmax;
    *q1 = *q0 + maxseg - 1;
    *s1 = *s0 + maxseg - 1;
    return maxseg;
}
END_NCBI_SCOPE
/*
 * ===========================================================================
 * $Log: nw_aligner.cpp,v $
 * Revision 1000.3  2004/06/01 18:04:52  gouriano
 * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.48
 *
 * Revision 1.48  2004/05/21 21:41:02  gorelenk
 * Added PCH ncbi_pch.hpp
 *
 * Revision 1.47  2004/05/19 13:37:47  kapustin
 * Remove test dumping code
 *
 * Revision 1.46  2004/05/18 21:43:40  kapustin
 * Code cleanup
 *
 * Revision 1.45  2004/05/17 14:50:56  kapustin
 * Add/remove/rearrange some includes and object declarations
 *
 * Revision 1.44  2004/04/23 14:39:47  kapustin
 * Add Splign library and other changes
 *
 * Revision 1.42  2003/12/29 13:03:48  kapustin
 * Return string from GetTranscriptString().
 *
 * Revision 1.41  2003/10/31 19:40:13  kapustin
 * Get rid of some WS and GCC complains
 *
 * Revision 1.40  2003/10/27 20:47:35  kapustin
 * Minor code cleanup
 *
 * Revision 1.39  2003/10/14 18:44:46  kapustin
 * Adjust esf flags during pattern-guided alignment
 *
 * Revision 1.38  2003/09/30 19:50:04  kapustin
 * Make use of standard score matrix interface
 *
 * Revision 1.37  2003/09/26 14:43:18  kapustin
 * Remove exception specifications
 *
 * Revision 1.36  2003/09/15 20:49:11  kapustin
 * Clear the transcript when setting new sequences
 *
 * Revision 1.35  2003/09/04 16:07:38  kapustin
 * Use STL vectors for exception-safe dynamic arrays and matrices
 *
 * Revision 1.34  2003/09/03 17:28:40  kapustin
 * Clean the list of includes
 *
 * Revision 1.33  2003/09/02 22:38:18  kapustin
 * Move format functionality out of the class
 *
 * Revision 1.32  2003/07/09 15:11:22  kapustin
 * Update plain text output with seq ids and use inequalities instead of
 * binary operation to verify intron boundaries
 *
 * Revision 1.31  2003/06/26 20:39:39  kapustin
 * Rename formal parameters in SetEndSpaceFree() to avoid conflict with macro
 * under some configurations
 *
 * Revision 1.30  2003/06/17 17:20:44  kapustin
 * CNWAlignerException -> CAlgoAlignException
 *
 * Revision 1.29  2003/06/17 16:06:13  kapustin
 * Detect all variety of splices in Seq-Align formatter (restored from 1.27)
 *
 * Revision 1.28  2003/06/17 14:51:04  dicuccio
 * Fixed after algo/ rearragnement
 *
 * Revision 1.26  2003/06/02 14:04:49  kapustin
 * Progress indication-related updates
 *
 * Revision 1.25  2003/05/23 18:27:02  kapustin
 * Use weak comparisons in core recurrences. Adjust for new
 * transcript identifiers.
 *
 * Revision 1.24  2003/04/17 14:44:40  kapustin
 * A few changes to eliminate gcc warnings
 *
 * Revision 1.23  2003/04/14 19:00:55  kapustin
 * Add guide creation facility.  x_Run() -> x_Align()
 *
 * Revision 1.22  2003/04/10 19:15:16  kapustin
 * Introduce guiding hits approach
 *
 * Revision 1.21  2003/04/02 20:52:55  kapustin
 * Make FormatAsText virtual. Pass output string as a parameter.
 *
 * Revision 1.20  2003/03/31 15:32:05  kapustin
 * Calculate score independently from transcript
 *
 * Revision 1.19  2003/03/19 16:36:39  kapustin
 * Reverse memory limit condition
 *
 * Revision 1.18  2003/03/18 15:15:51  kapustin
 * Implement virtual memory checking function. Allow separate free end
 * gap specification
 *
 * Revision 1.17  2003/03/17 15:31:46  kapustin
 * Forced conversion to double in memory limit checking to avoid integer
 * overflow
 *
 * Revision 1.16  2003/03/14 19:18:50  kapustin
 * Add memory limit checking. Fix incorrect seq index references when
 * reporting about incorrect input seq character. Support all characters
 * within IUPACna coding
 *
 * Revision 1.15  2003/03/07 13:51:11  kapustin
 * Use zero-based indices to specify seq coordinates in ASN
 *
 * Revision 1.14  2003/03/05 20:13:48  kapustin
 * Simplify FormatAsText(). Fix FormatAsSeqAlign(). Convert sequence
 * alphabets to capitals
 *
 * Revision 1.13  2003/02/11 16:06:54  kapustin
 * Add end-space free alignment support
 *
 * Revision 1.12  2003/02/04 23:04:50  kapustin
 * Add progress callback support
 *
 * Revision 1.11  2003/01/31 02:55:48  lavr
 * User proper argument for ::time()
 *
 * Revision 1.10  2003/01/30 20:32:36  kapustin
 * Add EstiamteRunningTime()
 *
 * Revision 1.9  2003/01/28 12:39:03  kapustin
 * Implement ASN.1 and SeqAlign output
 *
 * Revision 1.8  2003/01/24 16:48:50  kapustin
 * Support more output formats - type 2 and gapped FastA
 *
 * Revision 1.7  2003/01/21 16:34:21  kapustin
 * Get rid of reverse() and reverse_copy() not supported by MSVC and/or ICC
 *
 * Revision 1.6  2003/01/21 12:41:37  kapustin
 * Use class neg infinity constant to specify least possible score
 *
 * Revision 1.5  2003/01/08 15:42:59  kapustin
 * Fix initialization for the first column of the backtrace matrix
 *
 * Revision 1.4  2003/01/06 18:04:10  kapustin
 * CNWAligner: add sequence size verification
 *
 * Revision 1.3  2002/12/31 13:53:13  kapustin
 * CNWAligner::Format() -- use CNcbiOstrstreamToString
 *
 * Revision 1.2  2002/12/12 17:59:28  kapustin
 * Enable spliced alignments
 *
 * Revision 1.1  2002/12/06 17:41:21  ivanov
 * Initial revision
 *
 * ===========================================================================
 */