alnmix.cpp
上传用户:yhdzpy8989
上传日期:2007-06-13
资源大小:13604k
文件大小:83k
源码类别:

生物技术

开发平台:

C/C++

  1. /*
  2.  * ===========================================================================
  3.  * PRODUCTION $Log: alnmix.cpp,v $
  4.  * PRODUCTION Revision 1000.5  2004/06/01 19:40:45  gouriano
  5.  * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.95
  6.  * PRODUCTION
  7.  * ===========================================================================
  8.  */
  9. /*  $Id: alnmix.cpp,v 1000.5 2004/06/01 19:40:45 gouriano Exp $
  10. * ===========================================================================
  11. *
  12. *                            PUBLIC DOMAIN NOTICE
  13. *               National Center for Biotechnology Information
  14. *
  15. *  This software/database is a "United States Government Work" under the
  16. *  terms of the United States Copyright Act.  It was written as part of
  17. *  the author's official duties as a United States Government employee and
  18. *  thus cannot be copyrighted.  This software/database is freely available
  19. *  to the public for use. The National Library of Medicine and the U.S.
  20. *  Government have not placed any restriction on its use or reproduction.
  21. *
  22. *  Although all reasonable efforts have been taken to ensure the accuracy
  23. *  and reliability of the software and data, the NLM and the U.S.
  24. *  Government do not and cannot warrant the performance or results that
  25. *  may be obtained by using this software or data. The NLM and the U.S.
  26. *  Government disclaim all warranties, express or implied, including
  27. *  warranties of performance, merchantability or fitness for any particular
  28. *  purpose.
  29. *
  30. *  Please cite the author in any work or product based on this material.
  31. *
  32. * ===========================================================================
  33. *
  34. * Author:  Kamen Todorov, NCBI
  35. *
  36. * File Description:
  37. *   Alignment merger
  38. *
  39. * ===========================================================================
  40. */
  41. #include <ncbi_pch.hpp>
  42. #include <objtools/alnmgr/alnmix.hpp>
  43. #include <objects/seqalign/Seq_align_set.hpp>
  44. #include <objects/seq/Bioseq.hpp>
  45. #include <objects/seqloc/Seq_id.hpp>
  46. // Object Manager includes
  47. #include <objmgr/scope.hpp>
  48. #include <algorithm>
  49. BEGIN_NCBI_SCOPE
  50. BEGIN_objects_SCOPE // namespace ncbi::objects::
  51. CAlnMix::CAlnMix(void)
  52.     : m_MergeFlags(0),
  53.       m_SingleRefseq(false),
  54.       m_ContainsAA(false),
  55.       m_ContainsNA(false)
  56. {
  57. }
  58. CAlnMix::CAlnMix(CScope& scope)
  59.     : m_Scope(&scope),
  60.       m_MergeFlags(0),
  61.       m_SingleRefseq(false),
  62.       m_ContainsAA(false),
  63.       m_ContainsNA(false)
  64. {
  65. }
  66. CAlnMix::~CAlnMix(void)
  67. {
  68. }
  69. inline
  70. bool CAlnMix::x_CompareAlnSeqScores(const CAlnMixSeq* aln_seq1,
  71.                                     const CAlnMixSeq* aln_seq2) 
  72. {
  73.     return aln_seq1->m_Score > aln_seq2->m_Score;
  74. }
  75. inline
  76. bool CAlnMix::x_CompareAlnMatchScores(const CRef<CAlnMixMatch>& aln_match1, 
  77.                                       const CRef<CAlnMixMatch>& aln_match2) 
  78. {
  79.     return aln_match1->m_Score > aln_match2->m_Score;
  80. }
  81. void CAlnMix::x_Reset()
  82. {
  83.     if (m_DS) {
  84.         m_DS.Reset();
  85.     }
  86.     if (m_Aln) {
  87.         m_Aln.Reset();
  88.     }
  89.     m_Segments.clear();
  90.     m_Rows.clear();
  91.     m_ExtraRows.clear();
  92.     ITERATE (TSeqs, seq_i, m_Seqs) {
  93.         (*seq_i)->m_Starts.clear();
  94.         (*seq_i)->m_ExtraRow = 0;
  95.     }
  96. }
  97. void CAlnMix::Merge(TMergeFlags flags)
  98. {
  99.     if (m_InputDSs.empty()) {
  100.         NCBI_THROW(CAlnException, eMergeFailure,
  101.                    "CAlnMix::Merge(): "
  102.                    "No alignments were added for merging.");
  103.     }
  104.     if ( !m_DS  ||  m_MergeFlags != flags) {
  105.         x_Reset();
  106.         m_MergeFlags = flags;
  107.         if (m_MergeFlags & fTryOtherMethodOnFail) {
  108.             try {
  109.                 x_Merge();
  110.             } catch(...) {
  111.                 if (m_MergeFlags & fGen2EST) {
  112.                     m_MergeFlags &= !fGen2EST;
  113.                 } else {
  114.                     m_MergeFlags |= fGen2EST;
  115.                 }
  116.                 try {
  117.                     x_Merge();
  118.                 } catch(...) {
  119.                     NCBI_THROW(CAlnException, eUnknownMergeFailure,
  120.                                "CAlnMix::x_Merge(): "
  121.                                "Both Gen2EST and Nucl2Nucl "
  122.                                "merges failed.");
  123.                 }
  124.             }
  125.         } else {
  126.             x_Merge();
  127.         }
  128.     }
  129. }
  130. void CAlnMix::Add(const CSeq_align& aln, TAddFlags flags)
  131. {
  132.     if (m_InputAlnsMap.find((void *)&aln) == m_InputAlnsMap.end()) {
  133.         // add only if not already added
  134.         m_InputAlnsMap[(void *)&aln] = &aln;
  135.         m_InputAlns.push_back(CConstRef<CSeq_align>(&aln));
  136.         if (aln.GetSegs().IsDenseg()) {
  137.             Add(aln.GetSegs().GetDenseg(), flags);
  138.         } else if (aln.GetSegs().IsStd()) {
  139.             CRef<CSeq_align> sa = aln.CreateDensegFromStdseg(this);
  140.             Add(*sa, flags);
  141.         } else if (aln.GetSegs().IsDisc()) {
  142.             ITERATE (CSeq_align_set::Tdata,
  143.                      aln_it,
  144.                      aln.GetSegs().GetDisc().Get()) {
  145.                 Add(**aln_it, flags);
  146.             }
  147.         }
  148.     }
  149. }
  150. void CAlnMix::Add(const CDense_seg &ds, TAddFlags flags)
  151. {
  152.     const CDense_seg* dsp = &ds;
  153.     if (m_InputDSsMap.find((void *)dsp) != m_InputDSsMap.end()) {
  154.         return; // it has already been added
  155.     }
  156.     x_Reset();
  157. #if _DEBUG
  158.     dsp->Validate(true);
  159. #endif    
  160.     m_InputDSsMap[(void *)dsp] = dsp;
  161.     // check if scope was given
  162.     if ( !(flags & fDontUseObjMgr  ||  m_Scope != 0) ) {
  163.         NCBI_THROW(CAlnException, eMergeFailure, 
  164.                    "CAlnMix::Add(): "
  165.                    "AlnMix will not create a scope for you. "
  166.                    "Either create one in advance and provide reference "
  167.                    "through CAlnMix constructor or use fDontUseObjMgr flag.");
  168.     }
  169.     // translate (extend with widths) the dense-seg if necessary
  170.     if (flags & fForceTranslation  &&  !dsp->IsSetWidths()) {
  171.         if (flags & fDontUseObjMgr) {
  172.             string errstr = string("CAlnMix::Add(): ") 
  173.                 + "Cannot force translation for Dense_seg "
  174.                 + NStr::IntToString(m_InputDSs.size() + 1) + ". "
  175.                 + "Neither CDense_seg::m_Widths are supplied, "
  176.                 + "nor OM is used to identify molecule type.";
  177.             NCBI_THROW(CAlnException, eMergeFailure, errstr);
  178.         } else {
  179.             m_InputDSs.push_back(x_ExtendDSWithWidths(*dsp));
  180.             dsp = m_InputDSs.back();
  181.         }
  182.     } else {
  183.         m_InputDSs.push_back(CConstRef<CDense_seg>(dsp));
  184.     }
  185.     if (flags & fDontUseObjMgr  &&  flags & fCalcScore) {
  186.         NCBI_THROW(CAlnException, eMergeFailure, "CAlnMix::Add(): "
  187.                    "fCalcScore cannot be used together with fDontUseObjMgr");
  188.     }
  189.     m_AddFlags = flags;
  190.     int ds_index = m_InputDSs.size();
  191.     vector<CRef<CAlnMixSeq> > ds_seq;
  192.     // check the widths
  193.     if (dsp->IsSetWidths()) {
  194.         if (dsp->GetWidths().size() != (size_t) dsp->GetDim()) {
  195.             string errstr = string("CAlnMix::Add(): ")
  196.                 + "Dense-seg "
  197.                 + NStr::IntToString(ds_index)
  198.                 + " has incorrect widths size ("
  199.                 + NStr::IntToString(dsp->GetWidths().size())
  200.                 + "). Should be equal to its dim ("
  201.                 + NStr::IntToString(dsp->GetDim()) + ").";
  202.             NCBI_THROW(CAlnException, eMergeFailure, errstr);
  203.         }
  204.     }
  205.     //store the seqs
  206.     for (CAlnMap::TNumrow row = 0;  row < dsp->GetDim();  row++) {
  207.         CRef<CAlnMixSeq> aln_seq;
  208.         if (m_AddFlags & fDontUseObjMgr) {
  209.             // identify sequences by their seq ids as provided by
  210.             // the dense seg (not as reliable as with OM, but faster)
  211.             CRef<CSeq_id> seq_id(new CSeq_id);
  212.             seq_id->Assign(*dsp->GetIds()[row]);
  213.             TSeqIdMap::iterator it = m_SeqIds.find(seq_id);
  214.             if (it == m_SeqIds.end()) {
  215.                 // add this seq id
  216.                 aln_seq = new CAlnMixSeq();
  217.                 m_SeqIds[seq_id] = aln_seq;
  218.                 aln_seq->m_SeqId = seq_id;
  219.                 aln_seq->m_DS_Count = 0;
  220.                 // add this sequence
  221.                 m_Seqs.push_back(aln_seq);
  222.             
  223.                 // AA or NA?
  224.                 if (dsp->IsSetWidths()) {
  225.                     if (dsp->GetWidths()[row] == 1) {
  226.                         aln_seq->m_IsAA = true;
  227.                         m_ContainsAA = true;
  228.                     } else {
  229.                         m_ContainsNA = true;
  230.                     }
  231.                 }
  232.                  
  233.             } else {
  234.                 aln_seq = it->second;
  235.             }
  236.             
  237.         } else {
  238.             // uniquely identify the bioseq
  239.             x_IdentifyAlnMixSeq(aln_seq, *(ds.GetIds())[row]);
  240. #if _DEBUG
  241.             // Verify the widths (if exist)
  242.             if (dsp->IsSetWidths()) {
  243.                 const int& width = dsp->GetWidths()[row];
  244.                 if (width == 1  &&  aln_seq->m_IsAA != true  ||
  245.                     width == 3  &&  aln_seq->m_IsAA != false) {
  246.                     string errstr = string("CAlnMix::Add(): ")
  247.                         + "Incorrect width(" 
  248.                         + NStr::IntToString(width) +
  249.                         ") or molecule type(" + 
  250.                         (aln_seq->m_IsAA ? "AA" : "NA") +
  251.                         ").";
  252.                     NCBI_THROW(CAlnException, eInvalidSegment,
  253.                                errstr);
  254.                 }
  255.             }
  256. #endif
  257.         }
  258.         aln_seq->m_DS_Count++;
  259.         ds_seq.push_back(aln_seq);
  260.     }
  261.     //record all alignment relations
  262.     int              seg_off = 0;
  263.     TSignedSeqPos    start1, start2;
  264.     TSeqPos          len;
  265.     bool             single_chunk;
  266.     CAlnMap::TNumrow first_non_gapped_row_found;
  267.     bool             strands_exist = 
  268.         dsp->GetStrands().size() == (size_t)dsp->GetNumseg() * dsp->GetDim();
  269.     for (CAlnMap::TNumseg seg =0;  seg < dsp->GetNumseg();  seg++) {
  270.         len = dsp->GetLens()[seg];
  271.         single_chunk = true;
  272.         for (CAlnMap::TNumrow row1 = 0;  row1 < dsp->GetDim();  row1++) {
  273.             if ((start1 = dsp->GetStarts()[seg_off + row1]) >= 0) {
  274.                 //search for a match for the piece of seq on row1
  275.                 CRef<CAlnMixSeq> aln_seq1 = ds_seq[row1];
  276.                 for (CAlnMap::TNumrow row2 = row1+1;
  277.                      row2 < dsp->GetDim();  row2++) {
  278.                     if ((start2 = dsp->GetStarts()[seg_off + row2]) >= 0) {
  279.                         //match found
  280.                         if (single_chunk) {
  281.                             single_chunk = false;
  282.                             first_non_gapped_row_found = row1;
  283.                         }
  284.                         
  285.                         //create the match
  286.                         CRef<CAlnMixMatch> match(new CAlnMixMatch);
  287.                         //add only pairs with the first_non_gapped_row_found
  288.                         //still, calc the score to be added to the seqs' scores
  289.                         if (row1 == first_non_gapped_row_found) {
  290.                             m_Matches.push_back(match);
  291.                         }
  292.                         CRef<CAlnMixSeq> aln_seq2 = ds_seq[row2];
  293.                         match->m_AlnSeq1 = aln_seq1;
  294.                         match->m_Start1 = start1;
  295.                         match->m_AlnSeq2 = aln_seq2;
  296.                         match->m_Start2 = start2;
  297.                         match->m_Len = len;
  298.                         match->m_DSIndex = ds_index;
  299.                         // determine the strand
  300.                         match->m_StrandsDiffer = false;
  301.                         ENa_strand strand1 = eNa_strand_plus;
  302.                         ENa_strand strand2 = eNa_strand_plus;
  303.                         if (strands_exist) {
  304.                             if (dsp->GetStrands()[seg_off + row1] 
  305.                                 == eNa_strand_minus) {
  306.                                 strand1 = eNa_strand_minus;
  307.                             }
  308.                             if (dsp->GetStrands()[seg_off + row2] 
  309.                                 == eNa_strand_minus) {
  310.                                 strand2 = eNa_strand_minus;
  311.                             }
  312.                             if (strand1 == eNa_strand_minus  &&
  313.                                 strand2 != eNa_strand_minus  ||
  314.                                 strand1 != eNa_strand_minus  &&
  315.                                 strand2 == eNa_strand_minus) {
  316.                                 match->m_StrandsDiffer = true;
  317.                             }
  318.                         }
  319.                         //Determine the score
  320.                         if (m_AddFlags & fCalcScore) {
  321.                             // calc the score by seq comp
  322.                             string s1, s2;
  323.                             if (strand1 == eNa_strand_minus) {
  324.                                 CSeqVector seq_vec = 
  325.                                     aln_seq1->m_BioseqHandle->GetSeqVector
  326.                                     (CBioseq_Handle::eCoding_Iupac,
  327.                                      CBioseq_Handle::eStrand_Minus);
  328.                                 TSeqPos size = seq_vec.size();
  329.                                 seq_vec.GetSeqData(size - (start1 + len),
  330.                                                    size - start1, 
  331.                                                    s1);
  332.                             } else {
  333.                                 aln_seq1->m_BioseqHandle->GetSeqVector
  334.                                     (CBioseq_Handle::eCoding_Iupac,
  335.                                      CBioseq_Handle::eStrand_Plus).
  336.                                     GetSeqData(start1, start1 + len, s1);
  337.                             }                                
  338.                             if (strand2 ==  eNa_strand_minus) {
  339.                                 CSeqVector seq_vec = 
  340.                                     aln_seq2->m_BioseqHandle->GetSeqVector
  341.                                     (CBioseq_Handle::eCoding_Iupac,
  342.                                      CBioseq_Handle::eStrand_Minus);
  343.                                 TSeqPos size = seq_vec.size();
  344.                                 seq_vec.GetSeqData(size - (start2 + len),
  345.                                                    size - start2, 
  346.                                                    s2);
  347.                             } else {
  348.                                 aln_seq2->m_BioseqHandle->GetSeqVector
  349.                                     (CBioseq_Handle::eCoding_Iupac,
  350.                                      CBioseq_Handle::eStrand_Plus).
  351.                                     GetSeqData(start2, start2 + len, s2);
  352.                             }
  353.                             //verify that we were able to load all data
  354.                             if (s1.length() != len || s2.length() != len) {
  355.                                 string errstr = string("CAlnMix::Add(): ")
  356.                                     + "Unable to load data for segment "
  357.                                     + NStr::IntToString(seg) +
  358.                                     ", rows " + NStr::IntToString(row1) +
  359.                                     " and " + NStr::IntToString(row2);
  360.                                 NCBI_THROW(CAlnException, eInvalidSegment,
  361.                                            errstr);
  362.                             }
  363.                             match->m_Score = 
  364.                                 CAlnVec::CalculateScore
  365.                                 (s1, s2, aln_seq1->m_IsAA, aln_seq2->m_IsAA);
  366.                         } else {
  367.                             match->m_Score = len;
  368.                         }
  369.                         
  370.                         aln_seq1->m_Score += match->m_Score;
  371.                         aln_seq2->m_Score += match->m_Score;
  372.                     }
  373.                 }
  374.                 if (single_chunk) {
  375.                     //record it
  376.                     CRef<CAlnMixMatch> match(new CAlnMixMatch);
  377.                     match->m_Score = 0;
  378.                     match->m_AlnSeq1 = aln_seq1;
  379.                     match->m_Start1 = start1;
  380.                     match->m_AlnSeq2 = 0;
  381.                     match->m_Start2 = 0;
  382.                     match->m_Len = len;
  383.                     match->m_StrandsDiffer = false;
  384.                     match->m_DSIndex = m_InputDSs.size();
  385.                     m_Matches.push_back(match);
  386.                 }
  387.             }
  388.         }
  389.         seg_off += dsp->GetDim();
  390.     }
  391. }
  392. void CAlnMix::x_Merge()
  393. {
  394.     bool first_refseq = true; // mark the first loop
  395.     if (m_MergeFlags & fSortSeqsByScore) {
  396.         stable_sort(m_Seqs.begin(), m_Seqs.end(), x_CompareAlnSeqScores);
  397.     }
  398.     // Find the refseq (if such exists)
  399.     {{
  400.         m_SingleRefseq = false;
  401.         m_IndependentDSs = m_InputDSs.size() > 1;
  402.         unsigned int ds_cnt;
  403.         NON_CONST_ITERATE (TSeqs, it, m_Seqs){
  404.             ds_cnt = (*it)->m_DS_Count;
  405.             if (ds_cnt > 1) {
  406.                 m_IndependentDSs = false;
  407.             }
  408.             if (ds_cnt == m_InputDSs.size()) {
  409.                 m_SingleRefseq = true;
  410.                 if ( !first_refseq ) {
  411.                     CAlnMixSeq * refseq = *it;
  412.                     m_Seqs.erase(it);
  413.                     m_Seqs.insert(m_Seqs.begin(), refseq);
  414.                 }
  415.                 break;
  416.             }
  417.             first_refseq = false;
  418.         }
  419.     }}
  420.     // Index the sequences
  421.     {{
  422.         int seq_idx=0;
  423.         ITERATE (TSeqs, seq_i, m_Seqs) {
  424.             (*seq_i)->m_SeqIndex = seq_idx++;
  425.         }
  426.     }}
  427.     // Set the widths if the mix contains both AA & NA
  428.     // or in case we force translation
  429.     if (m_ContainsNA  &&  m_ContainsAA  ||  m_AddFlags & fForceTranslation) {
  430.         ITERATE (TSeqs, seq_i, m_Seqs) {
  431.             (*seq_i)->m_Width = (*seq_i)->m_IsAA ? 1 : 3;
  432.         }
  433.     }
  434.     // Sort matches by score
  435.     stable_sort(m_Matches.begin(), m_Matches.end(), x_CompareAlnMatchScores);
  436.     CAlnMixSeq * refseq = 0, * seq1 = 0, * seq2 = 0;
  437.     TSeqPos start, start1, start2, len, curr_len;
  438.     int width1, width2;
  439.     CAlnMixMatch * match;
  440.     CAlnMixSeq::TMatchList::iterator match_list_iter1, match_list_iter2;
  441.     CAlnMixSeq::TMatchList::iterator match_list_i;
  442.     TSecondRowFits second_row_fits;
  443.     refseq = *(m_Seqs.begin());
  444.     TMatches::iterator match_i = m_Matches.begin();
  445.     CRef<CAlnMixSegment> seg;
  446.     CAlnMixSeq::TStarts::iterator start_i, lo_start_i, hi_start_i, tmp_start_i;
  447.     first_refseq = true; // mark the first loop
  448.     while (true) {
  449.         // reached the end?
  450.         if (first_refseq ?
  451.             match_i == m_Matches.end()  &&  m_Matches.size() :
  452.             match_list_i == refseq->m_MatchList.end()) {
  453.             // move on to the next refseq
  454.             refseq->m_RefBy = 0;
  455.             // try to find the best scoring 'connected' candidate
  456.             ITERATE (TSeqs, it, m_Seqs){
  457.                 if ( !((*it)->m_MatchList.empty())  &&
  458.                      (*it)->m_RefBy == refseq) {
  459.                     refseq = *it;
  460.                     break;
  461.                 }
  462.             }
  463.             if (refseq->m_RefBy == 0) {
  464.                 // no candidate was found 'connected' to the refseq
  465.                 // continue with the highest scoring candidate
  466.                 ITERATE (TSeqs, it, m_Seqs){
  467.                     if ( !((*it)->m_MatchList.empty()) ) {
  468.                         refseq = *it;
  469.                         break;
  470.                     }
  471.                 }
  472.             }
  473.             if (refseq->m_MatchList.empty()) {
  474.                 break; // done
  475.             } else {
  476.                 first_refseq = false;
  477.                 match_list_i = refseq->m_MatchList.begin();
  478.             }
  479.             continue;
  480.         } else {
  481.             // iterate
  482.             match = first_refseq ? *(match_i++) : *(match_list_i++);
  483.         }
  484.         curr_len = len = match->m_Len;
  485.         // is it a match with this refseq?
  486.         if (match->m_AlnSeq1 == refseq) {
  487.             seq1 = match->m_AlnSeq1;
  488.             start1 = match->m_Start1;
  489.             match_list_iter1 = match->m_MatchIter1;
  490.             seq2 = match->m_AlnSeq2;
  491.             start2 = match->m_Start2;
  492.             match_list_iter2 = match->m_MatchIter2;
  493.         } else if (match->m_AlnSeq2 == refseq) {
  494.             seq1 = match->m_AlnSeq2;
  495.             start1 = match->m_Start2;
  496.             match_list_iter1 = match->m_MatchIter2;
  497.             seq2 = match->m_AlnSeq1;
  498.             start2 = match->m_Start1;
  499.             match_list_iter2 = match->m_MatchIter1;
  500.         } else {
  501.             seq1 = match->m_AlnSeq1;
  502.             seq2 = match->m_AlnSeq2;
  503.             // mark the two refseqs, they are candidates for next refseq(s)
  504.             seq1->m_MatchList.push_back(match);
  505.             (match->m_MatchIter1 = seq1->m_MatchList.end())--;
  506.             if (seq2) {
  507.                 seq2->m_MatchList.push_back(match);
  508.                 (match->m_MatchIter2 = seq2->m_MatchList.end())--;
  509.             }
  510.             // mark that there's no match with this refseq
  511.             seq1 = 0;
  512.         }
  513.         // save the match info into the segments map
  514.         if (seq1) {
  515.             // order the match
  516.             match->m_AlnSeq1 = seq1;
  517.             match->m_Start1 = start1;
  518.             match->m_AlnSeq2 = seq2;
  519.             match->m_Start2 = start2;
  520.             width1 = seq1->m_Width;
  521.             if (seq2) {
  522.                 width2 = seq2->m_Width;
  523.             }
  524.             // in case of translated refseq
  525.             if (width1 == 3) {
  526.                 // check the frame 
  527.                 int frame = start1 % 3;
  528.                 if (seq1->m_Starts.empty()) {
  529.                     seq1->m_Frame = frame;
  530.                 } else {
  531.                     while (seq1->m_Frame != frame) {
  532.                         if (!seq1->m_ExtraRow) {
  533.                             // create an extra row
  534.                             CRef<CAlnMixSeq> row (new CAlnMixSeq);
  535.                             row->m_BioseqHandle = seq1->m_BioseqHandle;
  536.                             row->m_SeqId = seq1->m_SeqId;
  537.                             row->m_Width = seq1->m_Width;
  538.                             row->m_Frame = frame;
  539.                             row->m_SeqIndex = seq1->m_SeqIndex;
  540.                             if (m_MergeFlags & fQuerySeqMergeOnly) {
  541.                                 row->m_DSIndex = match->m_DSIndex;
  542.                             }
  543.                             m_ExtraRows.push_back(row);
  544.                             seq1->m_ExtraRow = row;
  545.                             seq1 = match->m_AlnSeq1 = seq1->m_ExtraRow;
  546.                             break;
  547.                         }
  548.                         seq1 = match->m_AlnSeq1 = seq1->m_ExtraRow;
  549.                     }
  550.                 }
  551.             }
  552.             // this match is used, erase from seq1 list
  553.             if ( !first_refseq ) {
  554.                 if ( !refseq->m_MatchList.empty() ) {
  555.                     refseq->m_MatchList.erase(match_list_iter1);
  556.                 }
  557.             }
  558.             // if a subject sequence place it in the proper row
  559.             if ( !first_refseq  &&  m_MergeFlags & fQuerySeqMergeOnly) {
  560.                 bool proper_row_found = false;
  561.                 while (true) {
  562.                     if (seq1->m_DSIndex == match->m_DSIndex) {
  563.                         proper_row_found = true;
  564.                         break;
  565.                     } else {
  566.                         if (seq1->m_ExtraRow) {
  567.                             seq1 = match->m_AlnSeq2 = seq1->m_ExtraRow;
  568.                         } else {
  569.                             break;
  570.                         }
  571.                     }
  572.                 }
  573.                 if ( !proper_row_found ) {
  574.                     NCBI_THROW(CAlnException, eMergeFailure,
  575.                                "CAlnMix::x_Merge(): "
  576.                                "Proper row not found for the match. "
  577.                                "Cannot use fQuerySeqMergeOnly?");
  578.                 }
  579.             }
  580.             
  581.             CAlnMixSeq::TStarts& starts = seq1->m_Starts;
  582.             if (seq2) {
  583.                 // mark it, it is linked to the refseq
  584.                 seq2->m_RefBy = refseq;
  585.                 // this match is used erase from seq2 list
  586.                 if ( !first_refseq ) {
  587.                     if ( !seq2->m_MatchList.empty() ) {
  588.                         seq2->m_MatchList.erase(match_list_iter2);
  589.                     }
  590.                 }
  591.             }
  592.             start_i = starts.end();
  593.             lo_start_i = starts.end();
  594.             hi_start_i = starts.end();
  595.             if (!starts.size()) {
  596.                 // no starts exist yet
  597.                 if ( !m_IndependentDSs ) {
  598.                     // TEMPORARY, for the single refseq version of mix,
  599.                     // clear all MatchLists and exit
  600.                     if ( !(m_SingleRefseq  ||  first_refseq) ) {
  601.                         ITERATE (TSeqs, it, m_Seqs){
  602.                             if ( !((*it)->m_MatchList.empty()) ) {
  603.                                 (*it)->m_MatchList.clear();
  604.                             }
  605.                         }
  606.                         break; 
  607.                     }
  608.                 }
  609.                 // this seq has not yet been used, set the strand
  610.                 seq1->m_PositiveStrand = ! (m_MergeFlags & fNegativeStrand);
  611.                 //create the first one
  612.                 seg = new CAlnMixSegment;
  613.                 seg->m_Len = len;
  614.                 seg->m_DSIndex = match->m_DSIndex;
  615.                 starts[start1] = seg;
  616.                 seg->m_StartIts[seq1] = 
  617.                     lo_start_i = hi_start_i = starts.begin();
  618.                 second_row_fits = eSecondRowFitsOk;
  619.                 // DONE!
  620.             } else {
  621.                 // some starts exist already
  622.                 if (seq2) {
  623.                     // check if the second row fits
  624.                     // this will truncate the match if 
  625.                     // there's an inconsistent overlap
  626.                     // and truncation was requested
  627.                     second_row_fits = x_SecondRowFits(match);
  628.                     {{
  629.                         // reset the ones below,
  630.                         // since match may have been modified
  631.                         seq1 = match->m_AlnSeq1;
  632.                         start1 = match->m_Start1;
  633.                         match_list_iter1 = match->m_MatchIter1;
  634.                         seq2 = match->m_AlnSeq2;
  635.                         start2 = match->m_Start2;
  636.                         match_list_iter2 = match->m_MatchIter2;
  637.                         curr_len = len = match->m_Len;
  638.                     }}
  639.                     if (second_row_fits == eIgnoreMatch) {
  640.                         continue;
  641.                     }
  642.                 }
  643.                 // look ahead
  644.                 if ((lo_start_i = start_i = starts.lower_bound(start1))
  645.                     == starts.end()  ||
  646.                     start1 < start_i->first) {
  647.                     // the start position does not exist
  648.                     if (lo_start_i != starts.begin()) {
  649.                         --lo_start_i;
  650.                     }
  651.                 }
  652.                 // look back
  653.                 if (hi_start_i == starts.end()  &&  start_i != lo_start_i) {
  654.                     CAlnMixSegment * prev_seg = lo_start_i->second;
  655.                     if (lo_start_i->first + prev_seg->m_Len * width1 >
  656.                         start1) {
  657.                         // x----..   becomes  x-x--..
  658.                         //   x--..
  659.                         
  660.                         TSeqPos len1 = (start1 - lo_start_i->first) / width1;
  661.                         TSeqPos len2 = prev_seg->m_Len - len1;
  662.                         
  663.                         // create the second seg
  664.                         seg = new CAlnMixSegment;
  665.                         seg->m_Len = len2;
  666.                         seg->m_DSIndex = match->m_DSIndex;
  667.                         starts[start1] = seg;
  668.                         
  669.                         // create rows info
  670.                         ITERATE (CAlnMixSegment::TStartIterators, it, 
  671.                                  prev_seg->m_StartIts) {
  672.                             CAlnMixSeq * seq = it->first;
  673.                             tmp_start_i = it->second;
  674.                             if (seq->m_PositiveStrand ==
  675.                                 seq1->m_PositiveStrand) {
  676.                                 seq->m_Starts
  677.                                     [tmp_start_i->first + len1 * seq->m_Width]
  678.                                     = seg;
  679.                                 seg->m_StartIts[seq] = ++tmp_start_i;
  680.                             } else {
  681.                                 seq->m_Starts
  682.                                     [tmp_start_i->first + len2 * seq->m_Width]
  683.                                     = prev_seg;
  684.                                 seq->m_Starts[tmp_start_i->first] = seg;
  685.                                 seg->m_StartIts[seq] = tmp_start_i;
  686.                                 prev_seg->m_StartIts[seq] = ++tmp_start_i;
  687.                             }
  688.                         }
  689.                         
  690.                         // truncate the first seg
  691.                         prev_seg->m_Len = len1;
  692.                         
  693.                         if (start_i != starts.begin()) {
  694.                             start_i--; // point to the newly created start
  695.                         }
  696.                     }
  697.                     if (lo_start_i != starts.end()) {
  698.                         lo_start_i++;
  699.                     }
  700.                 }
  701.             }
  702.             // loop through overlapping segments
  703.             start = start1;
  704.             while (hi_start_i == starts.end()) {
  705.                 if (start_i != starts.end()  &&  start_i->first == start) {
  706.                     CAlnMixSegment * prev_seg = start_i->second;
  707.                     if (prev_seg->m_Len > curr_len) {
  708.                         // x-------)  becomes  x----)x--)
  709.                         // x----)
  710.                         
  711.                         // create the second seg
  712.                         seg = new CAlnMixSegment;
  713.                         TSeqPos len1 = 
  714.                             seg->m_Len = prev_seg->m_Len - curr_len;
  715.                         start += curr_len * width1;
  716.                         // truncate the first seg
  717.                         prev_seg->m_Len = curr_len;
  718.                         
  719.                         // create rows info
  720.                         ITERATE (CAlnMixSegment::TStartIterators, it, 
  721.                                 prev_seg->m_StartIts) {
  722.                             CAlnMixSeq * seq = it->first;
  723.                             tmp_start_i = it->second;
  724.                             if (seq->m_PositiveStrand ==
  725.                                 seq1->m_PositiveStrand) {
  726.                                 seq->m_Starts[tmp_start_i->first +
  727.                                              curr_len * seq->m_Width]
  728.                                     = seg;
  729.                                 seg->m_StartIts[seq] = ++tmp_start_i;
  730.                             } else{
  731.                                 seq->m_Starts[tmp_start_i->first +
  732.                                              len1 * seq->m_Width]
  733.                                     = prev_seg;
  734.                                 seq->m_Starts[tmp_start_i->first] = seg;
  735.                                 seg->m_StartIts[seq] = tmp_start_i;
  736.                                 prev_seg->m_StartIts[seq] = ++tmp_start_i;
  737.                             }
  738.                         }
  739.                         hi_start_i = start_i; // DONE!
  740.                     } else if (curr_len == prev_seg->m_Len) {
  741.                         // x----)
  742.                         // x----)
  743.                         hi_start_i = start_i; // DONE!
  744.                     } else {
  745.                         // x----)     becomes  x----)x--)
  746.                         // x-------)
  747.                         start += prev_seg->m_Len * width1;
  748.                         curr_len -= prev_seg->m_Len;
  749.                         if (start_i != starts.end()) {
  750.                             start_i++;
  751.                         }
  752.                     }
  753.                 } else {
  754.                     seg = new CAlnMixSegment;
  755.                     starts[start] = seg;
  756.                     tmp_start_i = start_i;
  757.                     if (tmp_start_i != starts.begin()) {
  758.                         tmp_start_i--;
  759.                     }
  760.                     seg->m_StartIts[seq1] = tmp_start_i;
  761.                     if (start_i != starts.end()  &&
  762.                         start + curr_len * width1 > start_i->first) {
  763.                         //       x--..
  764.                         // x--------..
  765.                         seg->m_Len = (start_i->first - start) / width1;
  766.                         seg->m_DSIndex = match->m_DSIndex;
  767.                     } else {
  768.                         //       x-----)
  769.                         // x---)
  770.                         seg->m_Len = curr_len;
  771.                         seg->m_DSIndex = match->m_DSIndex;
  772.                         hi_start_i = start_i;
  773.                         if (hi_start_i != starts.begin()) {
  774.                             hi_start_i--; // DONE!
  775.                         }
  776.                     }
  777.                     start += seg->m_Len * width1;
  778.                     curr_len -= seg->m_Len;
  779.                     if (lo_start_i == start_i) {
  780.                         if (lo_start_i != starts.begin()) {
  781.                             lo_start_i--;
  782.                         }
  783.                     }
  784.                 }
  785.             }
  786.                  
  787.             // try to resolve the second row
  788.             if (seq2) {
  789.                 // set the frame if not initialized
  790.                 if (width2 == 3  &&  seq2->m_Starts.empty()) {
  791.                     seq2->m_Frame = start2 % 3;
  792.                 }
  793.                 // create a copy of the match,
  794.                 // which we could work with temporarily
  795.                 // without modifying the original
  796.                 CAlnMixMatch tmp_match = *match;
  797.                 match = &tmp_match; // point to the new tmp_match
  798.                 while (second_row_fits != eSecondRowFitsOk  &&
  799.                        second_row_fits != eIgnoreMatch) {
  800.                     if (!seq2->m_ExtraRow) {
  801.                         // create an extra row
  802.                         CRef<CAlnMixSeq> row (new CAlnMixSeq);
  803.                         row->m_BioseqHandle = seq2->m_BioseqHandle;
  804.                         row->m_SeqId = seq2->m_SeqId;
  805.                         row->m_Width = seq2->m_Width;
  806.                         row->m_Frame = start2 % 3;
  807.                         row->m_SeqIndex = seq2->m_SeqIndex;
  808.                         if (m_MergeFlags & fQuerySeqMergeOnly) {
  809.                             row->m_DSIndex = match->m_DSIndex;
  810.                         }
  811.                         m_ExtraRows.push_back(row);
  812.                         seq2->m_ExtraRow = row;
  813.                         seq2 = match->m_AlnSeq2 = seq2->m_ExtraRow;
  814.                         break;
  815.                     }
  816.                     seq2 = match->m_AlnSeq2 = seq2->m_ExtraRow;
  817.                     second_row_fits = x_SecondRowFits(match);
  818.                     {{
  819.                         // reset the ones below,
  820.                         // since match may have been modified
  821.                         seq1 = match->m_AlnSeq1;
  822.                         start1 = match->m_Start1;
  823.                         match_list_iter1 = match->m_MatchIter1;
  824.                         seq2 = match->m_AlnSeq2;
  825.                         start2 = match->m_Start2;
  826.                         match_list_iter2 = match->m_MatchIter2;
  827.                         curr_len = len = match->m_Len;
  828.                     }}
  829.                 }
  830.                 if (second_row_fits == eIgnoreMatch) {
  831.                     continue;
  832.                 }
  833.                 if (m_MergeFlags & fTruncateOverlaps) {
  834.                     // we need to reset these shtorcuts
  835.                     // in case the match was truncated
  836.                     start1 = match->m_Start1;
  837.                     start2 = match->m_Start2;
  838.                     len = match->m_Len;
  839.                 }
  840.                 // set the strand if first time
  841.                 if (seq2->m_Starts.empty()) {
  842.                     seq2->m_PositiveStrand = 
  843.                         (seq1->m_PositiveStrand ?
  844.                          !match->m_StrandsDiffer :
  845.                          match->m_StrandsDiffer);
  846.                 }
  847.                 // create row info
  848.                 CAlnMixSeq::TStarts& starts2 = match->m_AlnSeq2->m_Starts;
  849.                 start = start2;
  850.                 CAlnMixSeq::TStarts::iterator start2_i
  851.                     = starts2.lower_bound(start2);
  852.                 start_i = match->m_StrandsDiffer ? hi_start_i : lo_start_i;
  853.                 while(start < start2 + len * width2) {
  854.                     if (start2_i != starts2.end() &&
  855.                         start2_i->first == start) {
  856.                         // this position already exists
  857.                         if (start2_i->second != start_i->second) {
  858.                             // merge the two segments
  859.                             // store the seg in a CRef to delay its deletion
  860.                             // until after the iteration on it is finished
  861.                             CRef<CAlnMixSegment> tmp_seg = start2_i->second;
  862.                             ITERATE (CAlnMixSegment::TStartIterators,
  863.                                      it, 
  864.                                      tmp_seg->m_StartIts) {
  865.                                 CAlnMixSeq * tmp_seq = it->first;
  866.                                 tmp_start_i = it->second;
  867.                                 tmp_start_i->second = start_i->second;
  868.                                 start_i->second->m_StartIts[tmp_seq] =
  869.                                     tmp_start_i;
  870.                             }
  871.                         }
  872.                     } else {
  873.                         // this position does not exist, create it
  874.                         seq2->m_Starts[start] = start_i->second;
  875.                         // start2_i != starts.begin() because we just 
  876.                         // made an insertion, so decrement is ok
  877.                         start2_i--;
  878.                         
  879.                         // point this segment's row start iterator
  880.                         start_i->second->m_StartIts[seq2] = start2_i;
  881.                     }
  882.                     // increment values
  883.                     start += start_i->second->m_Len * width2;
  884.                     if (start2_i != starts2.end()) {
  885.                         start2_i++;
  886.                     }
  887.                     if (match->m_StrandsDiffer) {
  888.                         if (start_i != starts.begin()) {
  889.                             start_i--;
  890.                         }
  891.                     } else {
  892.                         if (start_i != starts.end()) {
  893.                             start_i++;
  894.                         }
  895.                     }
  896.                 }
  897.             }
  898.         }
  899.     }
  900.     x_CreateRowsVector();
  901.     x_CreateSegmentsVector();
  902.     x_CreateDenseg();
  903. }
  904. CAlnMix::TSecondRowFits
  905. CAlnMix::x_SecondRowFits(CAlnMixMatch * match) const
  906. {
  907.     CAlnMixSeq::TStarts&          starts1 = match->m_AlnSeq1->m_Starts;
  908.     CAlnMixSeq::TStarts&          starts2 = match->m_AlnSeq2->m_Starts;
  909.     CAlnMixSeq                  * seq1    = match->m_AlnSeq1,
  910.                                 * seq2    = match->m_AlnSeq2;
  911.     TSeqPos&                      start1  = match->m_Start1;
  912.     TSeqPos&                      start2  = match->m_Start2;
  913.     TSeqPos&                      len     = match->m_Len;
  914.     const int&                    width1  = seq1->m_Width;
  915.     const int&                    width2  = seq2->m_Width;
  916.     CAlnMixSeq::TStarts::iterator start_i;
  917.     TSignedSeqPos                 delta, delta1, delta2;
  918.     // subject sequences go on separate rows if requested
  919.     if (m_MergeFlags & fQuerySeqMergeOnly) {
  920.         if (seq2->m_DSIndex) {
  921.             if (seq2->m_DSIndex == match->m_DSIndex) {
  922.                 return eSecondRowFitsOk;
  923.             } else {
  924.                 return eForceSeparateRow;
  925.             }
  926.         } else {
  927.             seq2->m_DSIndex = match->m_DSIndex;
  928.             return eSecondRowFitsOk;
  929.         }
  930.     }
  931.     if ( !starts2.empty() ) {
  932.         // check strand
  933.         if (seq2->m_PositiveStrand !=
  934.             (seq1->m_PositiveStrand ?
  935.              !match->m_StrandsDiffer :
  936.              match->m_StrandsDiffer)) {
  937.             return eInconsistentStrand;
  938.         }
  939.         // check frame
  940.         if (seq2->m_Width == 3  &&  seq2->m_Frame != start2 % 3) {
  941.             return eInconsistentFrame;
  942.         }
  943.         start_i = starts2.lower_bound(start2);
  944.         // check below
  945.         if (start_i != starts2.begin()) {
  946.             start_i--;
  947.             
  948.             // check for inconsistency on the first row
  949.             if ( !m_IndependentDSs ) {
  950.                 CAlnMixSegment::TStartIterators::iterator start_it_i =
  951.                     start_i->second->m_StartIts.find(seq1);
  952.                 if (start_it_i != start_i->second->m_StartIts.end()) {
  953.                     if (match->m_StrandsDiffer) {
  954.                         delta = start1 + len * width1 - start_it_i->second->first;
  955.                         if (delta > 0) {
  956.                             // target above
  957.                             // x----- x-)-)
  958.                             // (----x (---x
  959.                             // target below
  960.                             if (m_MergeFlags & fTruncateOverlaps) {
  961.                                 delta /= width1;
  962.                                 if ((len -= delta) > 0) {
  963.                                     start2 += delta * width2;
  964.                                 } else {
  965.                                     return eIgnoreMatch;
  966.                                 }
  967.                             } else {
  968.                                 return eFirstRowOverlapAbove;
  969.                             }
  970.                         }
  971.                     } else {
  972.                         delta = start_it_i->second->first
  973.                             + start_i->second->m_Len * width1
  974.                             - start1;
  975.                         if (delta > 0) {
  976.                             // below target
  977.                             // x---- x-)--)
  978.                             // x---) x----)
  979.                             // below target
  980.                             if (m_MergeFlags & fTruncateOverlaps) {
  981.                                 delta /= width1;
  982.                                 if ((len -= delta) > 0) {
  983.                                     start1 += delta * width1;
  984.                                     start2 += delta * width2;
  985.                                 } else {
  986.                                     return eIgnoreMatch;
  987.                                 }
  988.                             } else {
  989.                                 return eFirstRowOverlapBelow;
  990.                             }
  991.                         }
  992.                     }
  993.                 }
  994.             }
  995.             // check for overlap with the segment below on second row
  996.             if ((delta = start_i->first + start_i->second->m_Len * width2
  997.                  - start2) > 0) {
  998.                 //       target
  999.                 // ----- ------
  1000.                 // x---- x-)--)
  1001.                 // below target
  1002.                 if (m_MergeFlags & fTruncateOverlaps) {
  1003.                     delta /= width2;
  1004.                     if ((len -= delta) > 0) {
  1005.                         start2 += delta * width2;
  1006.                         if ( !match->m_StrandsDiffer ) {
  1007.                             start1 += delta * width1;
  1008.                         }
  1009.                     } else {
  1010.                         return eIgnoreMatch;
  1011.                     }
  1012.                 } else {
  1013.                     return eSecondRowOverlap;
  1014.                 }
  1015.             }
  1016.             if (start_i != starts2.end()) {
  1017.                 start_i++;
  1018.             }
  1019.         }
  1020.         // check the overlapping area for consistency
  1021.         while (start_i != starts2.end()  &&  
  1022.                start_i->first < start2 + len * width2) {
  1023.             if ( !m_IndependentDSs ) {
  1024.                 CAlnMixSegment::TStartIterators::iterator start_it_i =
  1025.                     start_i->second->m_StartIts.find(seq1);
  1026.                 if (start_it_i != start_i->second->m_StartIts.end()) {
  1027.                     if (match->m_StrandsDiffer) {
  1028.                         // x---..- x---..--)
  1029.                         // (---..- (--x..--x
  1030.                         delta1 = (start1 - start_it_i->second->first) / width1 +
  1031.                             len - start_i->second->m_Len;
  1032.                     } else {
  1033.                         // x--..- x---...-)
  1034.                         // x--..- x---...-)
  1035.                         delta1 = (start_it_i->second->first - start1) / width1;
  1036.                     }
  1037.                     delta2 = (start_i->first - start2) / width2;
  1038.                     if (delta1 != delta2) {
  1039.                         if (m_MergeFlags & fTruncateOverlaps) {
  1040.                             delta = (delta1 < delta2 ? delta1 : delta2);
  1041.                             if (match->m_StrandsDiffer) {
  1042.                                 start1 += (len - delta) * width1;
  1043.                             }
  1044.                             len = delta;
  1045.                         } else {
  1046.                             return eInconsistentOverlap;
  1047.                         }
  1048.                     }
  1049.                 }
  1050.             }
  1051.             start_i++;
  1052.         }
  1053.         // check above for consistency
  1054.         if (start_i != starts2.end()) {
  1055.             if ( !m_IndependentDSs ) {
  1056.                 CAlnMixSegment::TStartIterators::iterator start_it_i =
  1057.                     start_i->second->m_StartIts.find(seq1);
  1058.                 if (start_it_i != start_i->second->m_StartIts.end()) {
  1059.                     if (match->m_StrandsDiffer) {
  1060.                         delta = start_it_i->second->first + 
  1061.                             start_i->second->m_Len * width1 - start1;
  1062.                         if (delta > 0) {
  1063.                             // below target
  1064.                             // x---- x-)--)
  1065.                             // (---x (----x
  1066.                             // above target
  1067.                             if (m_MergeFlags & fTruncateOverlaps) {
  1068.                                 if ((len -= delta / width1) > 0) {
  1069.                                     start1 += delta;
  1070.                                 } else {
  1071.                                     return eIgnoreMatch;
  1072.                                 }
  1073.                             } else {
  1074.                                 return eFirstRowOverlapBelow;
  1075.                             }
  1076.                         }
  1077.                     } else {
  1078.                         delta = start1 + len * width1 - start_it_i->second->first;
  1079.                         if (delta > 0) {
  1080.                             // target above
  1081.                             // x--x-) ----)
  1082.                             // x----) x---)
  1083.                             // target above
  1084.                             if (m_MergeFlags & fTruncateOverlaps) {
  1085.                                 if ((len -= delta / width1) <= 0) {
  1086.                                     return eIgnoreMatch;
  1087.                                 }
  1088.                             } else {
  1089.                                 return eFirstRowOverlapAbove;
  1090.                             }
  1091.                         }
  1092.                     }
  1093.                 }
  1094.             }
  1095.         }
  1096.         // check for inconsistent matches
  1097.         if ((start_i = starts1.find(start1)) == starts1.end() ||
  1098.             start_i->first != start1) {
  1099.             // commenting out for now, since moved the function call ahead            
  1100. //             NCBI_THROW(CAlnException, eMergeFailure,
  1101. //                        "CAlnMix::x_SecondRowFits(): "
  1102. //                        "Internal error: starts1 do not match");
  1103.         } else {
  1104.             CAlnMixSegment::TStartIterators::iterator it;
  1105.             TSeqPos tmp_start =
  1106.                 match->m_StrandsDiffer ? start2 + len * width2 : start2;
  1107.             while (start_i != starts1.end()  &&
  1108.                    start_i->first < start1 + len * width1) {
  1109.                 CAlnMixSegment::TStartIterators& its = 
  1110.                     start_i->second->m_StartIts;
  1111.                 if (match->m_StrandsDiffer) {
  1112.                     tmp_start -= start_i->second->m_Len * width2;
  1113.                 }
  1114.                 if ((it = its.find(seq2)) != its.end()) {
  1115.                     if (it->second->first != tmp_start) {
  1116.                         // found an inconsistent prev match
  1117.                         return eSecondRowInconsistency;
  1118.                     }
  1119.                 }
  1120.                 if ( !match->m_StrandsDiffer ) {
  1121.                     tmp_start += start_i->second->m_Len * width2;
  1122.                 }
  1123.                 start_i++;
  1124.             }
  1125.         }
  1126.     }
  1127.     return eSecondRowFitsOk;
  1128. }
  1129. void CAlnMix::x_CreateRowsVector()
  1130. {
  1131.     m_Rows.clear();
  1132.     int count = 0;
  1133.     ITERATE (TSeqs, i, m_Seqs) {
  1134.         CAlnMixSeq * seq = *i;
  1135.         m_Rows.push_back(seq);
  1136.         seq->m_RowIndex = count++;
  1137.         while ( (seq = seq->m_ExtraRow) != NULL ) {
  1138.             seq->m_RowIndex = count++;
  1139.             m_Rows.push_back(seq);
  1140.         }
  1141.     }
  1142. }
  1143. void CAlnMix::x_CreateSegmentsVector()
  1144. {
  1145.     TSegmentsContainer gapped_segs;
  1146.     // init the start iterators for each row
  1147.     NON_CONST_ITERATE (TSeqs, row_i, m_Rows) {
  1148.         CAlnMixSeq * row = *row_i;
  1149.         if ( !row->m_Starts.empty() ) {
  1150.             if (row->m_PositiveStrand) {
  1151.                 row->m_StartIt = row->m_Starts.begin();
  1152.             } else {
  1153.                 row->m_StartIt = row->m_Starts.end();
  1154.                 row->m_StartIt--;
  1155.             }
  1156.         } else {
  1157.             row->m_StartIt = row->m_Starts.end();
  1158. #if _DEBUG
  1159.             string errstr =
  1160.                 string("CAlnMix::x_CreateSegmentsVector():") +
  1161.                 " Internal error: no starts for row " +
  1162.                 NStr::IntToString(row->m_RowIndex) +
  1163.                 " (seq " +
  1164.                 NStr::IntToString(row->m_SeqIndex) + ").";
  1165.             NCBI_THROW(CAlnException, eMergeFailure, errstr);
  1166. #endif
  1167.         }
  1168.     }
  1169.     // init the start iterators for each extra row
  1170.     NON_CONST_ITERATE (list<CRef<CAlnMixSeq> >, row_i, m_ExtraRows) {
  1171.         CAlnMixSeq * row = *row_i;
  1172.         if ( !row->m_Starts.empty() ) {
  1173.             if (row->m_PositiveStrand) {
  1174.                 row->m_StartIt = row->m_Starts.begin();
  1175.             } else {
  1176.                 row->m_StartIt = row->m_Starts.end();
  1177.                 row->m_StartIt--;
  1178.             }
  1179.         } else {
  1180.             row->m_StartIt = row->m_Starts.end();
  1181. #if _DEBUG
  1182.             string errstr =
  1183.                 string("CAlnMix::x_CreateSegmentsVector():") +
  1184.                 " Internal error: no starts for row " +
  1185.                 NStr::IntToString(row->m_RowIndex) + ".";
  1186.             NCBI_THROW(CAlnException, eMergeFailure, errstr);
  1187. #endif
  1188.         }
  1189.     }
  1190. #if _DEBUG
  1191.     ITERATE (TSeqs, row_i, m_Rows) {
  1192.         ITERATE (CAlnMixSegment::TStarts, st_i, (*row_i)->m_Starts) {
  1193.             ITERATE(CAlnMixSegment::TStartIterators,
  1194.                     st_it_i, (*st_i).second->m_StartIts) {
  1195.                 // both should point to the same seg
  1196.                 if ((*st_it_i).second->second != (*st_i).second) {
  1197.                     string errstr =
  1198.                         string("CAlnMix::x_CreateSegmentsVector():")
  1199.                         + " Internal error: Segments messed up."
  1200.                         + " row=" + NStr::IntToString((*row_i)->m_RowIndex)
  1201.                         + " seq=" + NStr::IntToString((*row_i)->m_SeqIndex)
  1202.                         + " strand=" +
  1203.                         ((*row_i)->m_PositiveStrand ? "plus" : "minus");
  1204.                     NCBI_THROW(CAlnException, eMergeFailure, errstr);
  1205.                 }
  1206.             }
  1207.         }
  1208.          
  1209.     }       
  1210. #endif
  1211.     TSeqs::iterator refseq_it = m_Rows.begin();
  1212.     bool orig_refseq = true;
  1213.     while (true) {
  1214.         CAlnMixSeq * refseq = 0;
  1215.         while (refseq_it != m_Rows.end()) {
  1216.             refseq = *(refseq_it++);
  1217.             if (refseq->m_StartIt != refseq->m_Starts.end()) {
  1218.                 break;
  1219.             } else {
  1220.                 refseq = 0;
  1221.             }
  1222.         }
  1223.         if ( !refseq ) {
  1224.             // Done
  1225.             // add the gapped segments if any
  1226.             if (gapped_segs.size()) {
  1227.                 if (m_MergeFlags & fGapJoin) {
  1228.                     // request to try to align
  1229.                     // gapped segments w/ equal len
  1230.                     x_ConsolidateGaps(gapped_segs);
  1231.                 } else if (m_MergeFlags & fMinGap) {
  1232.                     // request to try to align 
  1233.                     // all gapped segments
  1234.                     x_MinimizeGaps(gapped_segs);
  1235.                 }
  1236.                 NON_CONST_ITERATE (TSegmentsContainer,
  1237.                                    seg_i, gapped_segs) {
  1238.                     m_Segments.push_back(&**seg_i);
  1239.                 }
  1240.                 gapped_segs.clear();
  1241.             }
  1242.             break; // from the main loop
  1243.         }
  1244.         // for each refseq segment
  1245.         while (refseq->m_StartIt != refseq->m_Starts.end()) {
  1246.             stack< CRef<CAlnMixSegment> > seg_stack;
  1247.             seg_stack.push(refseq->m_StartIt->second);
  1248.             
  1249.             while ( !seg_stack.empty() ) {
  1250.                 
  1251.                 bool pop_seg = true;
  1252.                 // check the gapped segments on the left
  1253.                 ITERATE (CAlnMixSegment::TStartIterators, start_its_i,
  1254.                          seg_stack.top()->m_StartIts) {
  1255.                     CAlnMixSeq * row = start_its_i->first;
  1256.                     if (row->m_StartIt != start_its_i->second) {
  1257. #if _DEBUG
  1258.                         if (row->m_PositiveStrand ?
  1259.                             row->m_StartIt->first >
  1260.                             start_its_i->second->first :
  1261.                             row->m_StartIt->first <
  1262.                             start_its_i->second->first) {
  1263.                             string errstr =
  1264.                                 string("CAlnMix::x_CreateSegmentsVector():")
  1265.                                 + " Internal error: Integrity broken" +
  1266.                                 " row=" + NStr::IntToString(row->m_RowIndex) +
  1267.                                 " seq=" + NStr::IntToString(row->m_SeqIndex)
  1268.                                 + " row->m_StartIt->first="
  1269.                                 + NStr::IntToString(row->m_StartIt->first)
  1270.                                 + " start_its_i->second->first=" +
  1271.                                 NStr::IntToString(start_its_i->second->first)
  1272.                                 + " refseq->m_StartIt->first=" +
  1273.                                 NStr::IntToString(refseq->m_StartIt->first)
  1274.                                 + " strand=" +
  1275.                                 (row->m_PositiveStrand ? "plus" : "minus");
  1276.                             NCBI_THROW(CAlnException, eMergeFailure, errstr);
  1277.                         }
  1278. #endif
  1279.                         seg_stack.push(row->m_StartIt->second);
  1280.                         pop_seg = false;
  1281.                         break;
  1282.                     }
  1283.                 }
  1284.                 if (pop_seg) {
  1285.                     // inc/dec iterators for each row of the seg
  1286.                     ITERATE (CAlnMixSegment::TStartIterators, start_its_i,
  1287.                              seg_stack.top()->m_StartIts) {
  1288.                         CAlnMixSeq * row = start_its_i->first;
  1289. #if _DEBUG
  1290.                         if (row->m_PositiveStrand  &&
  1291.                             row->m_StartIt->first > 
  1292.                             start_its_i->second->first  ||
  1293.                             !row->m_PositiveStrand  &&
  1294.                             row->m_StartIt->first <
  1295.                             start_its_i->second->first) {
  1296.                             string errstr =
  1297.                                 string("CAlnMix::x_CreateSegmentsVector():")
  1298.                                 + " Internal error: Integrity broken" +
  1299.                                 " row=" + NStr::IntToString(row->m_RowIndex) +
  1300.                                 " seq=" + NStr::IntToString(row->m_SeqIndex)
  1301.                                 + " row->m_StartIt->first="
  1302.                                 + NStr::IntToString(row->m_StartIt->first)
  1303.                                 + " start_its_i->second->first=" +
  1304.                                 NStr::IntToString(start_its_i->second->first)
  1305.                                 + " refseq->m_StartIt->first=" +
  1306.                                 NStr::IntToString(refseq->m_StartIt->first)
  1307.                                 + " strand=" +
  1308.                                 (row->m_PositiveStrand ? "plus" : "minus");
  1309.                             NCBI_THROW(CAlnException, eMergeFailure, errstr);
  1310.                         }
  1311. #endif
  1312.                         if (row->m_PositiveStrand) {
  1313.                             row->m_StartIt++;
  1314.                         } else {
  1315.                             if (row->m_StartIt == row->m_Starts.begin()) {
  1316.                                 row->m_StartIt = row->m_Starts.end();
  1317.                             } else {
  1318.                                 row->m_StartIt--;
  1319.                             }
  1320.                         }
  1321.                     }
  1322.                     if (seg_stack.size() > 1) {
  1323.                         // add to the gapped segments
  1324.                         gapped_segs.push_back(seg_stack.top());
  1325.                         seg_stack.pop();
  1326.                     } else {
  1327.                         // add the gapped segments if any
  1328.                         if (gapped_segs.size()) {
  1329.                             if (m_MergeFlags & fGapJoin) {
  1330.                                 // request to try to align
  1331.                                 // gapped segments w/ equal len
  1332.                                 x_ConsolidateGaps(gapped_segs);
  1333.                             } else if (m_MergeFlags & fMinGap) {
  1334.                                 // request to try to align 
  1335.                                 // all gapped segments
  1336.                                 x_MinimizeGaps(gapped_segs);
  1337.                             }
  1338.                             if (orig_refseq) {
  1339.                                 NON_CONST_ITERATE (TSegmentsContainer,
  1340.                                                    seg_i, gapped_segs) {
  1341.                                     m_Segments.push_back(&**seg_i);
  1342.                                 }
  1343.                                 gapped_segs.clear();
  1344.                             }
  1345.                         }
  1346.                         // add the refseq segment
  1347.                         if (orig_refseq) {
  1348.                             m_Segments.push_back(seg_stack.top());
  1349.                         } else {
  1350.                             gapped_segs.push_back(seg_stack.top());
  1351.                         }
  1352.                         seg_stack.pop();
  1353.                     } // if (seg_stack.size() > 1)
  1354.                 } // if (popseg)
  1355.             } // while ( !seg_stack.empty() )
  1356.         } // while (refseq->m_StartIt != refseq->m_Starts.end())
  1357.         orig_refseq = false;
  1358.     } // while (true)
  1359.     if (m_MergeFlags & fFillUnalignedRegions) {
  1360.         vector<TSignedSeqPos> starts;
  1361.         vector<TSeqPos> lens;
  1362.         starts.resize(m_Rows.size(), -1);
  1363.         lens.resize(m_Rows.size(), 0);
  1364.         TSeqPos len = 0;
  1365.         CAlnMap::TNumrow rowidx;
  1366.         TSegments::iterator seg_i = m_Segments.begin();
  1367.         while (seg_i != m_Segments.end()) {
  1368.             len = (*seg_i)->m_Len;
  1369.             ITERATE (CAlnMixSegment::TStartIterators, start_its_i,
  1370.                      (*seg_i)->m_StartIts) {
  1371.                 CAlnMixSeq * row = start_its_i->first;
  1372.                 rowidx = row->m_RowIndex;
  1373.                 TSignedSeqPos& prev_start = starts[rowidx];
  1374.                 TSeqPos& prev_len = lens[rowidx];
  1375.                 TSeqPos start = start_its_i->second->first;
  1376.                 const bool plus = row->m_PositiveStrand;
  1377.                 const int& width = row->m_Width;
  1378.                 TSeqPos prev_start_plus_len = prev_start + prev_len * width;
  1379.                 TSeqPos start_plus_len = start + len * width;
  1380.                 if (prev_start >= 0) {
  1381.                     if (plus  &&  prev_start_plus_len < start  ||
  1382.                         !plus  &&  start_plus_len < (TSeqPos) prev_start) {
  1383.                         // create a new seg
  1384.                         CRef<CAlnMixSegment> seg (new CAlnMixSegment);
  1385.                         TSeqPos new_start;
  1386.                         if (row->m_PositiveStrand) {
  1387.                             new_start = prev_start + prev_len * width;
  1388.                             seg->m_Len = (start - new_start) / width;
  1389.                         } else {
  1390.                             new_start = start_plus_len;
  1391.                             seg->m_Len = (prev_start - new_start) / width;
  1392.                         }                            
  1393.                         row->m_Starts[new_start] = seg;
  1394.                         CAlnMixSeq::TStarts::iterator start_i =
  1395.                             start_its_i->second;
  1396.                         seg->m_StartIts[row] = 
  1397.                             row->m_PositiveStrand ?
  1398.                             --start_i :
  1399.                             ++start_i;
  1400.                             
  1401.                         seg_i = m_Segments.insert(seg_i, seg);
  1402.                         seg_i++;
  1403.                     }
  1404.                 }
  1405.                 prev_start = start;
  1406.                 prev_len = len;
  1407.             }
  1408.             seg_i++;
  1409.         }
  1410.     }
  1411. }
  1412. void CAlnMix::x_ConsolidateGaps(TSegmentsContainer& gapped_segs)
  1413. {
  1414.     TSegmentsContainer::iterator seg1_i, seg2_i;
  1415.     seg2_i = seg1_i = gapped_segs.begin();
  1416.     if (seg2_i != gapped_segs.end()) {
  1417.         seg2_i++;
  1418.     }
  1419.     bool         cache = false;
  1420.     string       s1;
  1421.     TSeqPos      start1;
  1422.     int          score1;
  1423.     CAlnMixSeq * seq1;
  1424.     CAlnMixSeq * seq2;
  1425.     while (seg2_i != gapped_segs.end()) {
  1426.         CAlnMixSegment * seg1 = *seg1_i;
  1427.         CAlnMixSegment * seg2 = *seg2_i;
  1428.         // check if this seg possibly aligns with the previous one
  1429.         bool possible = true;
  1430.             
  1431.         if (seg2->m_Len == seg1->m_Len  && 
  1432.             seg2->m_StartIts.size() == 1) {
  1433.             seq2 = seg2->m_StartIts.begin()->first;
  1434.             // check if this seq was already used
  1435.             ITERATE (CAlnMixSegment::TStartIterators,
  1436.                      st_it,
  1437.                      (*seg1_i)->m_StartIts) {
  1438.                 if (st_it->first == seq2) {
  1439.                     possible = false;
  1440.                     break;
  1441.                 }
  1442.             }
  1443.             // check if score is sufficient
  1444.             if (possible  &&  m_AddFlags & fCalcScore) {
  1445.                 if (!cache) {
  1446.                     seq1 = seg1->m_StartIts.begin()->first;
  1447.                     
  1448.                     start1 = seg1->m_StartIts[seq1]->first;
  1449.                     TSeqPos start1_plus_len = 
  1450.                         start1 + seg1->m_Len * seq1->m_Width;
  1451.                         
  1452.                     if (seq1->m_PositiveStrand) {
  1453.                         seq1->m_BioseqHandle->GetSeqVector
  1454.                             (CBioseq_Handle::eCoding_Iupac,
  1455.                              CBioseq_Handle::eStrand_Plus).
  1456.                             GetSeqData(start1, start1_plus_len, s1);
  1457.                     } else {
  1458.                         CSeqVector seq_vec = 
  1459.                             seq1->m_BioseqHandle->GetSeqVector
  1460.                             (CBioseq_Handle::eCoding_Iupac,
  1461.                              CBioseq_Handle::eStrand_Minus);
  1462.                         TSeqPos size = seq_vec.size();
  1463.                         seq_vec.GetSeqData(size - start1_plus_len,
  1464.                                            size - start1, 
  1465.                                            s1);
  1466.                     }                                
  1467.                     score1 = 
  1468.                         CAlnVec::CalculateScore(s1, s1,
  1469.                                                 seq1->m_IsAA,
  1470.                                                 seq1->m_IsAA);
  1471.                     cache = true;
  1472.                 }
  1473.                 
  1474.                 string s2;
  1475.                 const TSeqPos& start2 = seg2->m_StartIts[seq2]->first;
  1476.                 TSeqPos start2_plus_len = 
  1477.                     start2 + seg2->m_Len * seq2->m_Width;
  1478.                             
  1479.                 if (seq2->m_PositiveStrand) {
  1480.                     seq2->m_BioseqHandle->GetSeqVector
  1481.                         (CBioseq_Handle::eCoding_Iupac,
  1482.                          CBioseq_Handle::eStrand_Plus).
  1483.                         GetSeqData(start2, start2_plus_len, s2);
  1484.                 } else {
  1485.                     CSeqVector seq_vec = 
  1486.                         seq2->m_BioseqHandle->GetSeqVector
  1487.                         (CBioseq_Handle::eCoding_Iupac,
  1488.                          CBioseq_Handle::eStrand_Minus);
  1489.                     TSeqPos size = seq_vec.size();
  1490.                     seq_vec.GetSeqData(size - start2_plus_len,
  1491.                                        size - start2, 
  1492.                                        s2);
  1493.                 }                                
  1494.                 int score2 = 
  1495.                     CAlnVec::CalculateScore(s1, s2, seq1->m_IsAA, seq2->m_IsAA);
  1496.                 if (score2 < 75 * score1 / 100) {
  1497.                     possible = false;
  1498.                 }
  1499.             }
  1500.             
  1501.         } else {
  1502.             possible = false;
  1503.         }
  1504.         if (possible) {
  1505.             // consolidate the ones so far
  1506.             
  1507.             // add the new row
  1508.             seg1->m_StartIts[seq2] = seg2->m_StartIts.begin()->second;
  1509.             
  1510.             // point the row's start position to the beginning seg
  1511.             seg2->m_StartIts.begin()->second->second = seg1;
  1512.             
  1513.             seg2_i = gapped_segs.erase(seg2_i);
  1514.         } else {
  1515.             cache = false;
  1516.             seg1_i++;
  1517.             seg2_i++;
  1518.         }
  1519.     }
  1520. }
  1521. void CAlnMix::x_MinimizeGaps(TSegmentsContainer& gapped_segs)
  1522. {
  1523.     TSegmentsContainer::iterator  seg_i, seg_i_end, seg_i_begin;
  1524.     CAlnMixSegment       * seg1, * seg2;
  1525.     CRef<CAlnMixSegment> seg;
  1526.     CAlnMixSeq           * seq;
  1527.     TSegmentsContainer            new_segs;
  1528.     seg_i_begin = seg_i_end = seg_i = gapped_segs.begin();
  1529.     typedef map<TSeqPos, CRef<CAlnMixSegment> > TLenMap;
  1530.     TLenMap len_map;
  1531.     while (seg_i_end != gapped_segs.end()) {
  1532.         len_map[(*seg_i_end)->m_Len];
  1533.         
  1534.         // check if this seg can possibly be minimized
  1535.         bool possible = true;
  1536.         seg_i = seg_i_begin;
  1537.         while (seg_i != seg_i_end) {
  1538.             seg1 = *seg_i;
  1539.             seg2 = *seg_i_end;
  1540.             
  1541.             ITERATE (CAlnMixSegment::TStartIterators,
  1542.                      st_it,
  1543.                      seg2->m_StartIts) {
  1544.                 seq = st_it->first;
  1545.                 // check if this seq was already used
  1546.                 if (seg1->m_StartIts.find(seq) != seg1->m_StartIts.end()) {
  1547.                     possible = false;
  1548.                     break;
  1549.                 }
  1550.             }
  1551.             if ( !possible ) {
  1552.                 break;
  1553.             }
  1554.             seg_i++;
  1555.         }
  1556.         seg_i_end++;
  1557.         if ( !possible  ||  seg_i_end == gapped_segs.end()) {
  1558.             // use the accumulated len info to create the new segs
  1559.             // create new segs with appropriate len
  1560.             TSeqPos len_so_far = 0;
  1561.             TLenMap::iterator len_i = len_map.begin();
  1562.             while (len_i != len_map.end()) {
  1563.                 len_i->second = new CAlnMixSegment();
  1564.                 len_i->second->m_Len = len_i->first - len_so_far;
  1565.                 len_so_far += len_i->second->m_Len;
  1566.                 len_i++;
  1567.             }
  1568.                 
  1569.             // loop through the accumulated orig segs.
  1570.             // copy info from them into the new segs
  1571.             TLenMap::iterator len_i_end;
  1572.             seg_i = seg_i_begin;
  1573.             while (seg_i != seg_i_end) {
  1574.                 TSeqPos orig_len = (*seg_i)->m_Len;
  1575.                 // determine the span of the current seg
  1576.                 len_i_end = len_map.find(orig_len);
  1577.                 len_i_end++;
  1578.                 // loop through its sequences
  1579.                 NON_CONST_ITERATE (CAlnMixSegment::TStartIterators,
  1580.                                    st_it,
  1581.                                    (*seg_i)->m_StartIts) {
  1582.                     seq = st_it->first;
  1583.                     TSeqPos orig_start = st_it->second->first;
  1584.                     len_i = len_map.begin();
  1585.                     len_so_far = 0;
  1586.                     // loop through the new segs
  1587.                     while (len_i != len_i_end) {
  1588.                         seg = len_i->second;
  1589.                     
  1590.                         // calc the start
  1591.                         TSeqPos this_start = orig_start + 
  1592.                             (seq->m_PositiveStrand ? 
  1593.                              len_so_far :
  1594.                              orig_len - len_so_far - seg->m_Len) *
  1595.                             seq->m_Width;
  1596.                         // create the bindings:
  1597.                         seq->m_Starts[this_start] = seg;
  1598.                         seg->m_StartIts[seq] = seq->m_Starts.find(this_start);
  1599.                         len_i++;
  1600.                         len_so_far += seg->m_Len;
  1601.                     }
  1602.                 }
  1603.                 seg_i++;
  1604.             }
  1605.             NON_CONST_ITERATE (TLenMap, len_it, len_map) {
  1606.                 new_segs.push_back(len_it->second);
  1607.             }
  1608.             len_map.clear();
  1609.             seg_i_begin = seg_i_end;
  1610.         }
  1611.     }
  1612.     gapped_segs.clear();
  1613.     ITERATE (TSegmentsContainer, new_seg_i, new_segs) {
  1614.         gapped_segs.push_back(*new_seg_i);
  1615.     }
  1616. }
  1617. void CAlnMix::x_CreateDenseg()
  1618. {
  1619.     int numrow  = 0,
  1620.         numrows = m_Rows.size();
  1621.     int numseg  = 0,
  1622.         numsegs = m_Segments.size();
  1623.     int num     = numrows * numsegs;
  1624.     m_DS = new CDense_seg();
  1625.     m_DS->SetDim(numrows);
  1626.     m_DS->SetNumseg(numsegs);
  1627.     m_Aln = new CSeq_align();
  1628.     m_Aln->SetType(CSeq_align::eType_not_set);
  1629.     m_Aln->SetSegs().SetDenseg(*m_DS);
  1630.     m_Aln->SetDim(numrows);
  1631.     CDense_seg::TIds&     ids     = m_DS->SetIds();
  1632.     CDense_seg::TStarts&  starts  = m_DS->SetStarts();
  1633.     CDense_seg::TStrands& strands = m_DS->SetStrands();
  1634.     CDense_seg::TLens&    lens    = m_DS->SetLens();
  1635.     ids.resize(numrows);
  1636.     lens.resize(numsegs);
  1637.     starts.resize(num, -1);
  1638.     strands.resize(num, eNa_strand_minus);
  1639.     // ids
  1640.     for (numrow = 0;  numrow < numrows;  numrow++) {
  1641.         ids[numrow] = m_Rows[numrow]->m_SeqId;
  1642.     }
  1643.     int offset = 0;
  1644.     for (numseg = 0;  numseg < numsegs;  numseg++, offset += numrows) {
  1645.         // lens
  1646.         lens[numseg] = m_Segments[numseg]->m_Len;
  1647.         // starts
  1648.         ITERATE (CAlnMixSegment::TStartIterators, start_its_i,
  1649.                 m_Segments[numseg]->m_StartIts) {
  1650.             starts[offset + start_its_i->first->m_RowIndex] =
  1651.                 start_its_i->second->first;
  1652.         }
  1653.         // strands
  1654.         for (numrow = 0;  numrow < numrows;  numrow++) {
  1655.             if (m_Rows[numrow]->m_PositiveStrand) {
  1656.                 strands[offset + numrow] = eNa_strand_plus;
  1657.             }
  1658.         }
  1659.     }
  1660.     // widths
  1661.     if (m_ContainsNA  &&  m_ContainsAA  ||  m_AddFlags & fForceTranslation) {
  1662.         CDense_seg::TWidths&  widths  = m_DS->SetWidths();
  1663.         widths.resize(numrows);
  1664.         for (numrow = 0;  numrow < numrows;  numrow++) {
  1665.             widths[numrow] = m_Rows[numrow]->m_Width;
  1666.         }
  1667.     }
  1668. #if _DEBUG
  1669.     m_DS->Validate(true);
  1670. #endif    
  1671. }
  1672. CRef<CDense_seg> CAlnMix::x_ExtendDSWithWidths(const CDense_seg& ds)
  1673. {
  1674.     if (ds.IsSetWidths()) {
  1675.         NCBI_THROW(CAlnException, eMergeFailure,
  1676.                    "CAlnMix::x_ExtendDSWithWidths(): "
  1677.                    "Widths already exist for the input alignment");
  1678.     }
  1679.     bool contains_AA = false, contains_NA = false;
  1680.     CRef<CAlnMixSeq> aln_seq;
  1681.     for (size_t numrow = 0;  numrow < ds.GetDim();  numrow++) {
  1682.         x_IdentifyAlnMixSeq(aln_seq, *ds.GetIds()[numrow]);
  1683.         if (aln_seq->m_IsAA) {
  1684.             contains_AA = true;
  1685.         } else {
  1686.             contains_NA = true;
  1687.         }
  1688.     }
  1689.     if (contains_AA  &&  contains_NA) {
  1690.         NCBI_THROW(CAlnException, eMergeFailure,
  1691.                    "CAlnMix::x_ExtendDSWithWidths(): "
  1692.                    "Incorrect input Dense-seg: Contains both AAs and NAs but "
  1693.                    "widths do not exist!");
  1694.     }        
  1695.     CRef<CDense_seg> new_ds(new CDense_seg());
  1696.     // copy from the original
  1697.     new_ds->Assign(ds);
  1698.     if (contains_NA) {
  1699.         // fix the lengths
  1700.         const CDense_seg::TLens& lens     = ds.GetLens();
  1701.         CDense_seg::TLens&       new_lens = new_ds->SetLens();
  1702.         for (size_t numseg = 0; numseg < ds.GetNumseg(); numseg++) {
  1703.             if (lens[numseg] % 3) {
  1704.                 string errstr =
  1705.                     string("CAlnMix::x_ExtendDSWithWidths(): ") +
  1706.                     "Length of segment " + NStr::IntToString(numseg) +
  1707.                     " is not divisible by 3.";
  1708.                 NCBI_THROW(CAlnException, eMergeFailure, errstr);
  1709.             } else {
  1710.                 new_lens[numseg] = lens[numseg] / 3;
  1711.             }
  1712.         }
  1713.     }
  1714.     // add the widths
  1715.     CDense_seg::TWidths&  new_widths  = new_ds->SetWidths();
  1716.     new_widths.resize(ds.GetDim(), contains_NA ? 3 : 1);
  1717. #if _DEBUG
  1718.     new_ds->Validate(true);
  1719. #endif
  1720.     return new_ds;
  1721. }
  1722. void CAlnMix::x_IdentifyAlnMixSeq(CRef<CAlnMixSeq>& aln_seq, const CSeq_id& seq_id)
  1723. {
  1724.     CBioseq_Handle bioseq_handle = 
  1725.         GetScope().GetBioseqHandle(seq_id);
  1726.     if ( !bioseq_handle ) {
  1727.         string errstr = string("CAlnMix::x_IdentifyAlnMixSeq(): ") 
  1728.             + "Seq-id cannot be resolved: "
  1729.             + (seq_id.AsFastaString());
  1730.         
  1731.         NCBI_THROW(CAlnException, eInvalidSeqId, errstr);
  1732.     }
  1733.     TBioseqHandleMap::iterator it = m_BioseqHandles.find(bioseq_handle);
  1734.     if (it == m_BioseqHandles.end()) {
  1735.         // add this bioseq handle
  1736.         aln_seq = new CAlnMixSeq();
  1737.         m_BioseqHandles[bioseq_handle] = aln_seq;
  1738.         aln_seq->m_BioseqHandle = 
  1739.             &m_BioseqHandles.find(bioseq_handle)->first;
  1740.         
  1741.         CRef<CSeq_id> seq_id(new CSeq_id);
  1742.         seq_id->Assign(*aln_seq->m_BioseqHandle->GetSeqId());
  1743.         aln_seq->m_SeqId = seq_id;
  1744.         aln_seq->m_DS_Count = 0;
  1745.         // add this sequence
  1746.         m_Seqs.push_back(aln_seq);
  1747.             
  1748.         // AA or NA?
  1749.         if (aln_seq->m_BioseqHandle->GetBioseqCore()
  1750.             ->GetInst().GetMol() == CSeq_inst::eMol_aa) {
  1751.             aln_seq->m_IsAA = true;
  1752.             m_ContainsAA = true;
  1753.         } else {
  1754.             m_ContainsNA = true;
  1755.         }
  1756.     } else {
  1757.         aln_seq = it->second;
  1758.     }
  1759. }
  1760. void CAlnMix::ChooseSeqId(CSeq_id& id1, const CSeq_id& id2)
  1761. {
  1762.     CRef<CAlnMixSeq> aln_seq1, aln_seq2;
  1763.     x_IdentifyAlnMixSeq(aln_seq1, id1);
  1764.     x_IdentifyAlnMixSeq(aln_seq2, id2);
  1765.     if (aln_seq1->m_BioseqHandle != aln_seq2->m_BioseqHandle) {
  1766.         string errstr = 
  1767.             string("CAlnMix::ChooseSeqId(CSeq_id& id1, const CSeq_id& id2):")
  1768.             + " Seq-ids: " + id1.AsFastaString() 
  1769.             + " and " + id2.AsFastaString() 
  1770.             + " do not resolve to the same bioseq handle,"
  1771.             " but are used on the same 'row' in different segments."
  1772.             " This is legally allowed in a Std-seg, but conversion to"
  1773.             " Dense-seg cannot be performed.";
  1774.         NCBI_THROW(CAlnException, eInvalidSeqId, errstr);
  1775.     }
  1776.     CRef<CSeq_id> id1cref(&id1);
  1777.     CRef<CSeq_id> id2cref(&(const_cast<CSeq_id&>(id2)));
  1778.     if (CSeq_id::BestRank(id1cref) > CSeq_id::BestRank(id2cref)) {
  1779.         id1.Reset();
  1780.         SerialAssign<CSeq_id>(id1, id2);
  1781.     }
  1782. }    
  1783. END_objects_SCOPE // namespace ncbi::objects::
  1784. END_NCBI_SCOPE
  1785. /*
  1786. * ===========================================================================
  1787. *
  1788. * $Log: alnmix.cpp,v $
  1789. * Revision 1000.5  2004/06/01 19:40:45  gouriano
  1790. * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.95
  1791. *
  1792. * Revision 1.95  2004/05/25 19:16:33  todorov
  1793. * initialize second_row_fits
  1794. *
  1795. * Revision 1.94  2004/05/25 16:00:10  todorov
  1796. * remade truncation of overlaps
  1797. *
  1798. * Revision 1.93  2004/05/21 21:42:51  gorelenk
  1799. * Added PCH ncbi_pch.hpp
  1800. *
  1801. * Revision 1.92  2004/04/13 18:02:10  todorov
  1802. * Added some limit checks when inc/dec iterators
  1803. *
  1804. * Revision 1.91  2004/03/30 23:27:32  todorov
  1805. * Switch from CAlnMix::x_RankSeqId() to CSeq_id::BestRank()
  1806. *
  1807. * Revision 1.90  2004/03/30 20:41:29  todorov
  1808. * Rearranged the rank of seq-ids according to the C toolkit rearrangement
  1809. *
  1810. * Revision 1.89  2004/03/29 17:05:06  todorov
  1811. * extended exception msg
  1812. *
  1813. * Revision 1.88  2004/03/09 17:40:07  kuznets
  1814. * Compilation bug fix CAlnMix::ChooseSeqId
  1815. *
  1816. * Revision 1.87  2004/03/09 17:16:16  todorov
  1817. * + SSeqIdChooser implementation
  1818. *
  1819. * Revision 1.86  2004/01/16 23:59:45  todorov
  1820. * + missing width in seg calcs
  1821. *
  1822. * Revision 1.85  2003/12/22 19:30:35  kuznets
  1823. * Fixed compilation error (operator ambiguity) (MSVC)
  1824. *
  1825. * Revision 1.84  2003/12/22 18:30:37  todorov
  1826. * ObjMgr is no longer created internally. Scope should be passed as a reference in the ctor
  1827. *
  1828. * Revision 1.83  2003/12/18 19:46:27  todorov
  1829. * iterate -> ITERATE
  1830. *
  1831. * Revision 1.82  2003/12/12 22:42:53  todorov
  1832. * Init frames and use refseq instead of seq1 which may be refseqs child
  1833. *
  1834. * Revision 1.81  2003/12/10 16:14:38  todorov
  1835. * + exception when merge with no input
  1836. *
  1837. * Revision 1.80  2003/12/08 21:28:03  todorov
  1838. * Forced Translation of Nucleotide Sequences
  1839. *
  1840. * Revision 1.79  2003/11/24 17:11:32  todorov
  1841. * SetWidths only if necessary
  1842. *
  1843. * Revision 1.78  2003/11/03 14:43:44  todorov
  1844. * Use CDense_seg::Validate()
  1845. *
  1846. * Revision 1.77  2003/10/03 19:22:55  todorov
  1847. * Extended exception msg in case of empty row
  1848. *
  1849. * Revision 1.76  2003/09/16 14:45:56  todorov
  1850. * more informative exception strng
  1851. *
  1852. * Revision 1.75  2003/09/12 16:18:36  todorov
  1853. * -unneeded checks (">=" for unsigned)
  1854. *
  1855. * Revision 1.74  2003/09/12 15:42:36  todorov
  1856. * +CRef to delay obj deletion while iterating on it
  1857. *
  1858. * Revision 1.73  2003/09/08 20:44:21  todorov
  1859. * expression bug fix
  1860. *
  1861. * Revision 1.72  2003/09/08 19:48:55  todorov
  1862. * signed vs unsigned warnings fixed
  1863. *
  1864. * Revision 1.71  2003/09/08 19:24:04  todorov
  1865. * fix strand
  1866. *
  1867. * Revision 1.70  2003/09/08 19:18:32  todorov
  1868. * plus := all strands != minus
  1869. *
  1870. * Revision 1.69  2003/09/06 02:39:34  todorov
  1871. * OBJECTS_ALNMGR___ALNMIX__DBG -> _DEBUG
  1872. *
  1873. * Revision 1.68  2003/08/28 19:54:58  todorov
  1874. * trailing gap on master fix
  1875. *
  1876. * Revision 1.67  2003/08/20 19:35:32  todorov
  1877. * Added x_ValidateDenseg
  1878. *
  1879. * Revision 1.66  2003/08/20 14:34:58  todorov
  1880. * Support for NA2AA Densegs
  1881. *
  1882. * Revision 1.65  2003/08/01 20:49:26  todorov
  1883. * Changed the control of the main Merge() loop
  1884. *
  1885. * Revision 1.64  2003/07/31 18:50:29  todorov
  1886. * Fixed a couple of truncation problems; Added a match deletion
  1887. *
  1888. * Revision 1.63  2003/07/15 20:54:01  todorov
  1889. * exception type fixed
  1890. *
  1891. * Revision 1.62  2003/07/09 22:58:12  todorov
  1892. * row index bug fix in case of fillunaln
  1893. *
  1894. * Revision 1.61  2003/07/01 17:40:20  todorov
  1895. * fFillUnaligned bug fix
  1896. *
  1897. * Revision 1.60  2003/06/26 21:35:48  todorov
  1898. * + fFillUnalignedRegions
  1899. *
  1900. * Revision 1.59  2003/06/25 15:17:31  todorov
  1901. * truncation consistent for the whole segment now
  1902. *
  1903. * Revision 1.58  2003/06/24 15:24:14  todorov
  1904. * added optional truncation of overlaps
  1905. *
  1906. * Revision 1.57  2003/06/20 03:06:39  todorov
  1907. * Setting the seqalign type
  1908. *
  1909. * Revision 1.56  2003/06/19 18:37:19  todorov
  1910. * typo fixed
  1911. *
  1912. * Revision 1.55  2003/06/09 20:54:20  todorov
  1913. * Use of ObjMgr is now optional
  1914. *
  1915. * Revision 1.54  2003/06/03 20:56:52  todorov
  1916. * Bioseq handle validation
  1917. *
  1918. * Revision 1.53  2003/06/03 16:07:05  todorov
  1919. * Fixed overlap consistency check in case strands differ
  1920. *
  1921. * Revision 1.52  2003/06/03 14:38:26  todorov
  1922. * warning fixed
  1923. *
  1924. * Revision 1.51  2003/06/02 17:39:40  todorov
  1925. * Changes in rev 1.49 were lost. Reapplying them
  1926. *
  1927. * Revision 1.50  2003/06/02 16:06:40  dicuccio
  1928. * Rearranged src/objects/ subtree.  This includes the following shifts:
  1929. *     - src/objects/asn2asn --> arc/app/asn2asn
  1930. *     - src/objects/testmedline --> src/objects/ncbimime/test
  1931. *     - src/objects/objmgr --> src/objmgr
  1932. *     - src/objects/util --> src/objmgr/util
  1933. *     - src/objects/alnmgr --> src/objtools/alnmgr
  1934. *     - src/objects/flat --> src/objtools/flat
  1935. *     - src/objects/validator --> src/objtools/validator
  1936. *     - src/objects/cddalignview --> src/objtools/cddalignview
  1937. * In addition, libseq now includes six of the objects/seq... libs, and libmmdb
  1938. * replaces the three libmmdb? libs.
  1939. *
  1940. * Revision 1.49  2003/05/30 17:42:03  todorov
  1941. * 1) Bug fix in x_SecondRowFits
  1942. * 2) x_CreateSegmentsVector now uses a stack to order the segs
  1943. *
  1944. * Revision 1.48  2003/05/23 18:11:42  todorov
  1945. * More detailed exception txt
  1946. *
  1947. * Revision 1.47  2003/05/20 21:20:59  todorov
  1948. * mingap minus strand multiple segs bug fix
  1949. *
  1950. * Revision 1.46  2003/05/15 23:31:26  todorov
  1951. * Minimize gaps bug fix
  1952. *
  1953. * Revision 1.45  2003/05/09 16:41:27  todorov
  1954. * Optional mixing of the query sequence only
  1955. *
  1956. * Revision 1.44  2003/04/24 16:15:57  vasilche
  1957. * Added missing includes and forward class declarations.
  1958. *
  1959. * Revision 1.43  2003/04/15 14:21:12  vasilche
  1960. * Fixed order of member initializers.
  1961. *
  1962. * Revision 1.42  2003/04/14 18:03:19  todorov
  1963. * reuse of matches bug fix
  1964. *
  1965. * Revision 1.41  2003/04/01 20:25:31  todorov
  1966. * fixed iterator-- behaviour at the begin()
  1967. *
  1968. * Revision 1.40  2003/03/28 16:47:28  todorov
  1969. * Introduced TAddFlags (fCalcScore for now)
  1970. *
  1971. * Revision 1.39  2003/03/26 16:38:24  todorov
  1972. * mix independent densegs
  1973. *
  1974. * Revision 1.38  2003/03/18 17:16:05  todorov
  1975. * multirow inserts bug fix
  1976. *
  1977. * Revision 1.37  2003/03/12 15:39:47  kuznets
  1978. * iterate -> ITERATE
  1979. *
  1980. * Revision 1.36  2003/03/10 22:12:02  todorov
  1981. * fixed x_CompareAlnMatchScores callback
  1982. *
  1983. * Revision 1.35  2003/03/06 21:42:38  todorov
  1984. * bug fix in x_Reset()
  1985. *
  1986. * Revision 1.34  2003/03/05 21:53:24  todorov
  1987. * clean up miltiple mix case
  1988. *
  1989. * Revision 1.33  2003/03/05 17:42:41  todorov
  1990. * Allowing multiple mixes + general case speed optimization
  1991. *
  1992. * Revision 1.32  2003/03/05 16:18:17  todorov
  1993. * + str len err check
  1994. *
  1995. * Revision 1.31  2003/02/24 19:01:31  vasilche
  1996. * Use faster version of CSeq_id::Assign().
  1997. *
  1998. * Revision 1.30  2003/02/24 18:46:38  todorov
  1999. * Fixed a - strand row info creation bug
  2000. *
  2001. * Revision 1.29  2003/02/11 21:32:44  todorov
  2002. * fMinGap optional merging algorithm
  2003. *
  2004. * Revision 1.28  2003/02/04 00:05:16  todorov
  2005. * GetSeqData neg strand range bug
  2006. *
  2007. * Revision 1.27  2003/01/27 22:30:30  todorov
  2008. * Attune to seq_vector interface change
  2009. *
  2010. * Revision 1.26  2003/01/23 16:30:18  todorov
  2011. * Moved calc score to alnvec
  2012. *
  2013. * Revision 1.25  2003/01/22 21:00:21  dicuccio
  2014. * Fixed compile error - transposed #endif and }
  2015. *
  2016. * Revision 1.24  2003/01/22 19:39:13  todorov
  2017. * 1) Matches w/ the 1st non-gapped row added only; the rest used to calc seqs'
  2018. * score only
  2019. * 2) Fixed a couple of potential problems
  2020. *
  2021. * Revision 1.23  2003/01/17 18:56:26  todorov
  2022. * Perform merge algorithm even if only one input denseg
  2023. *
  2024. * Revision 1.22  2003/01/16 17:40:19  todorov
  2025. * Fixed strand param when calling GetSeqVector
  2026. *
  2027. * Revision 1.21  2003/01/10 17:37:28  todorov
  2028. * Fixed a bug in fNegativeStrand
  2029. *
  2030. * Revision 1.20  2003/01/10 15:12:13  dicuccio
  2031. * Ooops.  Methinks I should read my e-mail more thoroughly - thats '!= NULL', not
  2032. * '== NULL'.
  2033. *
  2034. * Revision 1.19  2003/01/10 15:11:12  dicuccio
  2035. * Small bug fixes: changed 'iterate' -> 'non_const_iterate' in x_Merge(); changed
  2036. * logic of while() in x_CreateRowsVector() to avoid compiler warning.
  2037. *
  2038. * Revision 1.18  2003/01/10 00:42:53  todorov
  2039. * Optional sorting of seqs by score
  2040. *
  2041. * Revision 1.17  2003/01/10 00:11:37  todorov
  2042. * Implemented fNegativeStrand
  2043. *
  2044. * Revision 1.16  2003/01/07 17:23:49  todorov
  2045. * Added conditionally compiled validation code
  2046. *
  2047. * Revision 1.15  2003/01/02 20:03:48  todorov
  2048. * Row strand init & check
  2049. *
  2050. * Revision 1.14  2003/01/02 16:40:11  todorov
  2051. * Added accessors to the input data
  2052. *
  2053. * Revision 1.13  2003/01/02 15:30:17  todorov
  2054. * Fixed the order of checks in x_SecondRowFits
  2055. *
  2056. * Revision 1.12  2002/12/30 20:55:47  todorov
  2057. * Added range fitting validation to x_SecondRowFits
  2058. *
  2059. * Revision 1.11  2002/12/30 18:08:39  todorov
  2060. * 1) Initialized extra rows' m_StartIt
  2061. * 2) Fixed a bug in x_SecondRowFits
  2062. *
  2063. * Revision 1.10  2002/12/27 23:09:56  todorov
  2064. * Additional inconsistency checks in x_SecondRowFits
  2065. *
  2066. * Revision 1.9  2002/12/27 17:27:13  todorov
  2067. * Force positive strand in all cases but negative
  2068. *
  2069. * Revision 1.8  2002/12/27 16:39:13  todorov
  2070. * Fixed a bug in the single Dense-seg case.
  2071. *
  2072. * Revision 1.7  2002/12/23 18:03:51  todorov
  2073. * Support for putting in and getting out Seq-aligns
  2074. *
  2075. * Revision 1.6  2002/12/19 00:09:23  todorov
  2076. * Added optional consolidation of segments that are gapped on the query.
  2077. *
  2078. * Revision 1.5  2002/12/18 18:58:17  ucko
  2079. * Tweak syntax to avoid confusing MSVC.
  2080. *
  2081. * Revision 1.4  2002/12/18 03:46:00  todorov
  2082. * created an algorithm for mixing alignments that share a single sequence.
  2083. *
  2084. * Revision 1.3  2002/10/25 20:02:41  todorov
  2085. * new fTryOtherMethodOnFail flag
  2086. *
  2087. * Revision 1.2  2002/10/24 21:29:13  todorov
  2088. * adding Dense-segs instead of Seq-aligns
  2089. *
  2090. * Revision 1.1  2002/08/23 14:43:52  ucko
  2091. * Add the new C++ alignment manager to the public tree (thanks, Kamen!)
  2092. *
  2093. *
  2094. * ===========================================================================
  2095. */