splign_compartment_finder.cpp
上传用户:yhdzpy8989
上传日期:2007-06-13
资源大小:13604k
文件大小:13k
源码类别:
生物技术
开发平台:
C/C++
- /*
- * ===========================================================================
- * PRODUCTION $Log: splign_compartment_finder.cpp,v $
- * PRODUCTION Revision 1000.0 2004/06/01 18:12:33 gouriano
- * PRODUCTION PRODUCTION: IMPORTED [GCC34_MSVC7] Dev-tree R1.5
- * PRODUCTION
- * ===========================================================================
- */
- /* $Id: splign_compartment_finder.cpp,v 1000.0 2004/06/01 18:12:33 gouriano Exp $
- * ===========================================================================
- *
- * PUBLIC DOMAIN NOTICE
- * National Center for Biotechnology Information
- *
- * This software/database is a "United States Government Work" under the
- * terms of the United States Copyright Act. It was written as part of
- * the author's official duties as a United States Government employee and
- * thus cannot be copyrighted. This software/database is freely available
- * to the public for use. The National Library of Medicine and the U.S.
- * Government have not placed any restriction on its use or reproduction.
- *
- * Although all reasonable efforts have been taken to ensure the accuracy
- * and reliability of the software and data, the NLM and the U.S.
- * Government do not and cannot warrant the performance or results that
- * may be obtained by using this software or data. The NLM and the U.S.
- * Government disclaim all warranties, express or implied, including
- * warranties of performance, merchantability or fitness for any particular
- * purpose.
- *
- * Please cite the author in any work or product based on this material.
- *
- * ===========================================================================
- *
- * Author: Yuri Kapustin
- *
- * File Description: Compartment Finder
- *
- * ===========================================================================
- */
- #include <ncbi_pch.hpp>
- #include "splign_compartment_finder.hpp"
- #include <corelib/ncbi_limits.hpp>
- #include <algo/align/align_exception.hpp>
- #include <algorithm>
- BEGIN_NCBI_SCOPE
- void CCompartmentFinder::CCompartment::UpdateMinMax() {
- m_box[0] = m_box[2] = kMax_UInt;
- m_box[1] = m_box[3] = 0;
- ITERATE(vector<const CHit*>, ph, m_members) {
- const CHit* hit = *ph;
- if(size_t(hit->m_ai[0]) < m_box[0]) {
- m_box[0] = hit->m_ai[0];
- }
- if(size_t(hit->m_ai[2]) < m_box[2]) {
- m_box[2] = hit->m_ai[2];
- }
- if(size_t(hit->m_ai[1]) > m_box[1]) {
- m_box[1] = hit->m_ai[1];
- }
- if(size_t(hit->m_ai[3]) > m_box[3]) {
- m_box[3] = hit->m_ai[3];
- }
- }
- }
- bool CCompartmentFinder::CCompartment::GetStrand() const {
- if(m_members.size()) {
- return m_members[0]->IsStraight();
- }
- else {
- NCBI_THROW( CAlgoAlignException,
- eInternal,
- "Strand requested on an empty compartment" );
- }
- }
- CCompartmentFinder::CCompartmentFinder(vector<CHit>::const_iterator start,
- vector<CHit>::const_iterator finish):
- m_intron_min(GetDefaultIntronMin()),
- m_intron_max(GetDefaultIntronMax()),
- m_penalty(GetDefaultPenalty()),
- m_min_coverage(GetDefaultMinCoverage()),
- m_iter(-1)
- {
- const size_t dim = finish - start;
- if(dim > 0) {
- m_hits.resize(dim);
- const CHit* p0 = &(*start);
- for(size_t i = 0; i < dim; ++i) {
- m_hits[i] = p0++;
- }
- }
- }
- size_t CCompartmentFinder::Run()
- {
- m_compartments.clear();
- const size_t dimhits = m_hits.size();
- if(dimhits == 0) {
- return 0;
- }
- // sort here to make sure that later at every step
- // all potential sources (hits from where to continue)
- // are visible
- sort(m_hits.begin(), m_hits.end(), CHit::PSubjLow_QueryLow_ptr);
- // insert dummy element
- list<SQueryMark> qmarks; // ordered list of query marks
- qmarks.clear();
- qmarks.push_back(SQueryMark(0, 0, -1));
- const list<SQueryMark>::iterator li_b = qmarks.begin();
- // *** Generic hit iteration (could be optimized of course) ***
- // For every hit:
- // - find out if its query mark already exists (qm_new)
- // - for query mark below the current:
- // -- check if it could be extended
- // -- evaluate extension potential
- // - for every other qm:
- // -- check if its compartment could be terminated
- // -- evaluate potential of terminating the compartment
- // to open a new one
- // - select the best potential
- // - if qm_new, create the new query mark, otherwise
- // update the existing one if the new score is higher
- // - if query mark was created or updated,
- // save the hit's status ( previous hit, extension or opening)
- //
- vector<SHitStatus> hitstatus (dimhits);
- for(size_t i = 0; i < dimhits; ++i) {
- const list<SQueryMark>::iterator li_e = qmarks.end();
- const CHit& h = *m_hits[i];
- list<SQueryMark>::iterator li0 = lower_bound(li_b, li_e,
- SQueryMark(h.m_ai[1], 0, -2));
- bool qm_new = (li0 == li_e)? true: (size_t(h.m_ai[1]) < li0->m_coord);
- int best_ext_score = kMin_Int;
- list<SQueryMark>::iterator li_best_ext = li_b;
- int best_open_score = kMin_Int;
- list<SQueryMark>::iterator li_best_open = li_b;
- for( list<SQueryMark>::iterator li = li_b; li != li_e; ++li) {
- const CHit* phc = (li->m_hit == -1)? 0: m_hits[li->m_hit];
- // check if good for extention
- if(li->m_hit < int(i)) {
- size_t q0 = h.m_ai[0], s0 = h.m_ai[2]; // possible continuation point
- bool good;
- if(li->m_hit == -1) {
- good = false;
- }
- else {
- if(phc->m_ai[1] >= h.m_ai[1]) {
- good = false;
- }
- else {
- if(phc->m_ai[1] < h.m_ai[0]) {
- q0 = h.m_ai[0];
- s0 = h.m_ai[2];
- }
- else {
- // find where this point would be on h
- q0 = phc->m_ai[1] + 1;
- s0 += q0 - h.m_ai[0];
- }
- int intron = int(s0) - phc->m_ai[3] - 1;
- good = int(m_intron_min) <= intron && intron <= int(m_intron_max);
- }
- }
- if(good) {
- int ext_score = li->m_score +
- int(0.01 * h.m_Idnty * (h.m_ai[1] - q0 + 1));
- if(ext_score > best_ext_score) {
- best_ext_score = ext_score;
- li_best_ext = li;
- }
- }
- }
- // check if good for closing and opening
- if(li->m_hit == -1 || phc->m_ai[3] < h.m_ai[2]) {
- int score_open = li->m_score - m_penalty +
- int(0.01*h.m_Idnty*(h.m_ai[1] - h.m_ai[0] + 1));
- if(score_open > best_open_score) {
- best_open_score = score_open;
- li_best_open = li;
- }
- }
- }
- SHitStatus::EType hit_type;
- int prev_hit;
- int best_score;
- if(best_ext_score > best_open_score) {
- hit_type = SHitStatus::eExtension;
- prev_hit = li_best_ext->m_hit;
- best_score = best_ext_score;
- }
- else {
- hit_type = SHitStatus::eOpening;
- prev_hit = li_best_open->m_hit;
- best_score = best_open_score;
- }
- bool updated = false;
- if(qm_new) {
- qmarks.insert(li0, SQueryMark(h.m_ai[1], best_score, i));
- updated = true;
- }
- else {
- if(best_score > li0->m_score) {
- li0->m_score = best_score;
- li0->m_hit = i;
- updated = true;
- }
- }
- if(updated) {
- hitstatus[i].m_type = hit_type;
- hitstatus[i].m_prev = prev_hit;
- }
- }
- // *** backtrace ***
- // - find query mark with the highest score
- // - trace it back until the dummy
- list<SQueryMark>::iterator li_best = li_b;
- ++li_best;
- int score_best = kMin_Int;
- for(list<SQueryMark>::iterator li = li_best, li_e = qmarks.end();
- li != li_e; ++li) {
- if(li->m_score > score_best) {
- score_best = li->m_score;
- li_best = li;
- }
- }
- if( int(score_best + m_penalty) >= int(m_min_coverage) ) {
- int i = li_best->m_hit;
- CCompartment* pc = 0;
- bool new_compartment = true;
- while(i != -1) {
- if(new_compartment) {
- new_compartment = false;
- m_compartments.push_back(CCompartment());
- pc = &m_compartments.back();
- }
- pc->AddMember(m_hits[i]);
- if(hitstatus[i].m_type == SHitStatus::eOpening) {
- new_compartment = true;
- }
- i = hitstatus[i].m_prev;
- }
- }
- return m_compartments.size();
- }
- CCompartmentFinder::CCompartment* CCompartmentFinder::GetFirst()
- {
- if(m_compartments.size()) {
- m_iter = 0;
- return &m_compartments[m_iter++];
- }
- else {
- m_iter = -1;
- return 0;
- }
- }
- void CCompartmentFinder::OrderCompartments(void) {
- for(int i = 0, dim = m_compartments.size(); i < dim; ++i) {
- m_compartments[i].UpdateMinMax();
- }
- sort(m_compartments.begin(), m_compartments.end(), PLowerSubj);
- }
- CCompartmentFinder::CCompartment* CCompartmentFinder::GetNext()
- {
- const size_t dim = m_compartments.size();
- if(m_iter == -1 || m_iter >= int(dim)) {
- m_iter = -1;
- return 0;
- }
- else {
- return &m_compartments[m_iter++];
- }
- }
- CCompartmentAccessor::CCompartmentAccessor(vector<CHit>::iterator istart,
- vector<CHit>::iterator ifinish,
- size_t comp_penalty,
- size_t min_coverage)
- {
- // separate strands for CompartmentFinder
- vector<CHit>::iterator ib = istart, ie = ifinish, ii = ib, iplus_beg = ie;
- sort(ib, ie, CHit::PPrecedeStrand);
- size_t minus_subj_min = kMax_UInt, minus_subj_max = 0;
- for(ii = ib; ii != ie; ++ii) {
- if(ii->IsStraight()) {
- iplus_beg = ii;
- break;
- }
- else {
- if(size_t(ii->m_ai[2]) < minus_subj_min) {
- minus_subj_min = ii->m_ai[2];
- }
- if(size_t(ii->m_ai[3]) > minus_subj_max) {
- minus_subj_max = ii->m_ai[3];
- }
- }
- }
- // minus
- {{
- // flip
- for(ii = ib; ii != iplus_beg; ++ii) {
- int s0 = minus_subj_min + minus_subj_max - ii->m_ai[3];
- int s1 = minus_subj_min + minus_subj_max - ii->m_ai[2];
- ii->m_an[2] = ii->m_ai[2] = s0;
- ii->m_an[3] = ii->m_ai[3] = s1;
- }
- CCompartmentFinder finder (ib, iplus_beg);
- finder.SetMinCoverage(min_coverage);
- finder.SetPenalty(comp_penalty);
- finder.Run();
- // un-flip
- for(ii = ib; ii != iplus_beg; ++ii) {
- int s0 = minus_subj_min + minus_subj_max - ii->m_ai[3];
- int s1 = minus_subj_min + minus_subj_max - ii->m_ai[2];
- ii->m_an[3] = ii->m_ai[2] = s0;
- ii->m_an[2] = ii->m_ai[3] = s1;
- }
- x_Copy2Pending(finder);
- }}
- // plus
- {{
- CCompartmentFinder finder (iplus_beg, ie);
- finder.SetMinCoverage(min_coverage);
- finder.SetPenalty(comp_penalty);
- finder.Run();
- x_Copy2Pending(finder);
- }}
- }
- void CCompartmentAccessor::x_Copy2Pending(CCompartmentFinder& finder)
- {
- finder.OrderCompartments();
- // copy to pending
- for(CCompartmentFinder::CCompartment* compartment = finder.GetFirst(); compartment;
- compartment = finder.GetNext()) {
- m_pending.push_back(vector<CHit>(0));
- vector<CHit>& vh = m_pending.back();
- for(const CHit* ph = compartment->GetFirst(); ph; ph = compartment->GetNext()) {
- vh.push_back(*ph);
- }
- const size_t* box = compartment->GetBox();
- m_ranges.push_back(box[0]-1);
- m_ranges.push_back(box[1]-1);
- m_ranges.push_back(box[2]-1);
- m_ranges.push_back(box[3]-1);
- m_strands.push_back(compartment->GetStrand());
- }
- }
- bool CCompartmentAccessor::GetFirst(vector<CHit>& compartment) {
- m_iter = 0;
- return GetNext(compartment);
- }
- bool CCompartmentAccessor::GetNext(vector<CHit>& compartment) {
- compartment.clear();
- if(m_iter < m_pending.size()) {
- compartment = m_pending[m_iter++];
- return true;
- }
- else {
- return false;
- }
- }
- END_NCBI_SCOPE
- /*
- * ===========================================================================
- * $Log: splign_compartment_finder.cpp,v $
- * Revision 1000.0 2004/06/01 18:12:33 gouriano
- * PRODUCTION: IMPORTED [GCC34_MSVC7] Dev-tree R1.5
- *
- * Revision 1.5 2004/05/24 16:13:57 gorelenk
- * Added PCH ncbi_pch.hpp
- *
- * Revision 1.4 2004/05/18 21:43:40 kapustin
- * Code cleanup
- *
- * Revision 1.3 2004/04/23 14:37:44 kapustin
- * *** empty log message ***
- *
- * ==========================================================================
- */