splign_formatter.cpp
上传用户:yhdzpy8989
上传日期:2007-06-13
资源大小:13604k
文件大小:11k
- /*
- * ===========================================================================
- * PRODUCTION $Log: splign_formatter.cpp,v $
- * PRODUCTION Revision 1000.0 2004/06/01 18:12:43 gouriano
- * PRODUCTION PRODUCTION: IMPORTED [GCC34_MSVC7] Dev-tree R1.6
- * PRODUCTION
- * ===========================================================================
- */
- /* $Id: splign_formatter.cpp,v 1000.0 2004/06/01 18:12:43 gouriano Exp $
- * ===========================================================================
- *
- * PUBLIC DOMAIN NOTICE
- * National Center for Biotechnology Information
- *
- * This software/database is a "United States Government Work" under the
- * terms of the United States Copyright Act. It was written as part of
- * the author's official duties as a United States Government employee and
- * thus cannot be copyrighted. This software/database is freely available
- * to the public for use. The National Library of Medicine and the U.S.
- * Government have not placed any restriction on its use or reproduction.
- *
- * Although all reasonable efforts have been taken to ensure the accuracy
- * and reliability of the software and data, the NLM and the U.S.
- * Government do not and cannot warrant the performance or results that
- * may be obtained by using this software or data. The NLM and the U.S.
- * Government disclaim all warranties, express or implied, including
- * warranties of performance, merchantability or fitness for any particular
- * purpose.
- *
- * Please cite the author in any work or product based on this material.
- *
- * ===========================================================================
- *
- * Author: Yuri Kapustin
- *
- * ===========================================================================
- *
- */
- #include <ncbi_pch.hpp>
- #include "splign_util.hpp"
- #include <objects/seqalign/Seq_align.hpp>
- #include <objects/seqloc/Seq_id.hpp>
- #include <algo/align/splign/splign_formatter.hpp>
- #include <objects/seqalign/Score.hpp>
- #include <objects/general/Object_id.hpp>
- #include <objects/seqalign/Dense_seg.hpp>
- BEGIN_NCBI_SCOPE
- USING_SCOPE(objects);
- CSplignFormatter::CSplignFormatter (const CSplign& splign):
- m_splign(&splign)
- {
- static const char kErrMsg[] = "ID_not_set";
- m_QueryId = m_SubjId = kErrMsg;
- }
- string CSplignFormatter::AsText(void) const
- {
- CNcbiOstrstream oss;
- oss.precision(3);
- const vector<CSplign::SAlignedCompartment>& acs = m_splign->GetResult();
- ITERATE(vector<CSplign::SAlignedCompartment>, ii, acs) {
-
- for(size_t i = 0, seg_dim = ii->m_segments.size(); i < seg_dim; ++i) {
- const CSplign::SSegment& seg = ii->m_segments[i];
- oss << ii->m_id << 't' << m_QueryId << 't' << m_SubjId << 't';
- if(seg.m_exon) {
- oss << seg.m_idty << 't';
- }
- else {
- oss << "-t";
- }
-
- oss << seg.m_len << 't';
-
- if(ii->m_QueryStrand) {
- oss << ii->m_qmin + seg.m_box[0] + 1 << 't'
- << ii->m_qmin + seg.m_box[1] + 1 << 't';
- }
- else {
- oss << ii->m_mrnasize - seg.m_box[0] << 't'
- << ii->m_mrnasize - seg.m_box[1] << 't';
- }
-
- if(seg.m_exon) {
-
- if(ii->m_SubjStrand) {
- oss << ii->m_smin + seg.m_box[2] + 1 << 't'
- << ii->m_smin + seg.m_box[3] + 1 << 't';
- }
- else {
- oss << ii->m_smax - seg.m_box[2] + 1 << 't'
- << ii->m_smax - seg.m_box[3] + 1 << 't';
- }
- }
- else {
- oss << "-t-t";
- }
-
- if(seg.m_exon) {
-
- oss << seg.m_annot << 't';
- oss << RLE(seg.m_details);
- #ifdef GENOME_PIPELINE
- oss << 't' << ScoreByTranscript(*(m_splign->GetAligner()),
- seg.m_details.c_str());
- #endif
- }
- else {
- if(i == 0) {
- oss << "<L-Gap>t";
- }
- else if(i == seg_dim - 1) {
- oss << "<R-Gap>t";
- }
- else {
- oss << "<M-Gap>t";
- }
- oss << '-';
- #ifdef GENOME_PIPELINE
- oss << "t-";
- #endif
- }
- oss << endl;
- }
- }
- return CNcbiOstrstreamToString(oss);
- }
- CRef<CSeq_align_set> CSplignFormatter::AsSeqAlignSet(void) const
- {
- const vector<CSplign::SAlignedCompartment>& acs = m_splign->GetResult();
- CRef<CSeq_align_set> rv (new CSeq_align_set);
- CSeq_align_set::Tdata& data = rv->Set();
- ITERATE(vector<CSplign::SAlignedCompartment>, ii, acs) {
-
- vector<size_t> boxes;
- vector<string> transcripts;
- vector<CNWAligner::TScore> scores;
- for(size_t i = 0, seg_dim = ii->m_segments.size(); i < seg_dim; ++i) {
- const CSplign::SSegment& seg = ii->m_segments[i];
- if(seg.m_exon) {
-
- if(ii->m_QueryStrand) {
- boxes.push_back(ii->m_qmin + seg.m_box[0]);
- boxes.push_back(ii->m_qmin + seg.m_box[1]);
- }
- else {
- boxes.push_back(ii->m_mrnasize - seg.m_box[0] - 1);
- boxes.push_back(ii->m_mrnasize - seg.m_box[1] - 1);
- }
- if(ii->m_SubjStrand) {
- boxes.push_back(ii->m_smin + seg.m_box[2]);
- boxes.push_back(ii->m_smin + seg.m_box[3]);
- }
- else {
- boxes.push_back(ii->m_smax - seg.m_box[2]);
- boxes.push_back(ii->m_smax - seg.m_box[3]);
- }
- transcripts.push_back(seg.m_details);
- scores.push_back(ScoreByTranscript(*(m_splign->GetAligner()),
- seg.m_details.c_str()));
- }
- }
-
- CRef<CSeq_align> sa (x_Compartment2SeqAlign(boxes,transcripts,scores));
- data.push_back(sa);
- }
-
- return rv;
- }
- CRef<CSeq_align> CSplignFormatter::x_Compartment2SeqAlign (
- const vector<size_t>& boxes,
- const vector<string>& transcripts,
- const vector<int>& scores ) const
- {
- const size_t num_exons = boxes.size() / 4;
- CRef<CSeq_align> sa (new CSeq_align);
- sa->Reset();
- // this is a discontinuous alignment
- sa->SetType(CSeq_align::eType_disc);
- sa->SetDim(2);
-
- // seq-ids
- CRef<CSeq_id> id1 ( new CSeq_id (m_QueryId) );
- if(id1->Which()==CSeq_id::e_not_set) {
- id1.Reset(new CSeq_id(CSeq_id::e_Local, m_QueryId, kEmptyStr));
- }
-
- CRef<CSeq_id> id2 ( new CSeq_id (m_SubjId) );
- if(id2->Which()==CSeq_id::e_not_set) {
- id2.Reset(new CSeq_id(CSeq_id::e_Local, m_SubjId, kEmptyStr));
- }
- // create seq-align-set
- CRef< CSeq_align::C_Segs > segs (new CSeq_align::C_Segs);
- CSeq_align_set& sas = segs->SetDisc();
- list<CRef<CSeq_align> >& sas_data = sas.Set();
-
- for(size_t i = 0; i < num_exons; ++i) {
- CRef<CSeq_align> sa (new CSeq_align);
- sa->Reset();
- sa->SetDim(2);
- sa->SetType(CSeq_align::eType_global);
- CRef<CScore> score (new CScore);
- CRef<CObject_id> id (new CObject_id);
- id->SetStr("splign");
- score->SetId(*id);
- CRef< CScore::C_Value > val (new CScore::C_Value);
- val->SetInt(scores[i]);
- score->SetValue(*val);
- CSeq_align::TScore& scorelist = sa->SetScore();
- scorelist.push_back(score);
- CRef<CSeq_align::C_Segs> segs (new CSeq_align::C_Segs);
- CDense_seg& ds = segs->SetDenseg();
- ds.SetNumseg(0);
- ds.SetDim(2);
- vector< CRef< CSeq_id > > &ids = ds.SetIds();
- ids.push_back(id1);
- ids.push_back(id2);
- const size_t* box = &(*(boxes.begin() + i*4));
- x_Exon2DS(box, transcripts[i], &ds);
- sa->SetSegs(*segs);
- sas_data.push_back(sa);
- }
-
- sa->SetSegs(*segs);
- return sa;
- }
- void CSplignFormatter::x_Exon2DS(const size_t* box, const string& trans,
- CDense_seg* pds ) const
- {
- bool strand = box[2] <= box[3];
- CDense_seg& ds = *pds;
- vector< TSignedSeqPos > &starts = ds.SetStarts();
- vector< TSeqPos > &lens = ds.SetLens();
- vector< ENa_strand > &strands = ds.SetStrands();
-
- // iterate through the transcript
- size_t seg_count = 0;
- size_t start1 = 0, pos1 = 0; // relative to exon start in mrna
- size_t start2 = 0, pos2 = 0; // and genomic
- size_t seg_len = 0;
-
- string::const_iterator ib = trans.begin(), ie = trans.end(), ii = ib;
- unsigned char seg_type;
- char c = *ii++;
- if(c == 'M' || c == 'R') {
- seg_type = 0; // matches and mismatches
- ++pos1;
- ++pos2;
- }
- else if (c == 'I') {
- seg_type = 1; // inserts
- ++pos2;
- }
- else {
- seg_type = 2; // dels
- ++pos1;
- }
-
- while(ii < ie) {
- c = *ii;
- if(isalpha(c)) {
- if(seg_type == 0 && (c == 'M' || c == 'R')) {
- ++pos1;
- ++pos2;
- ++ii;
- }
- else {
- // close current seg
- starts.push_back((seg_type == 1)? -1: TSignedSeqPos(box[0]+start1));
- starts.push_back((seg_type == 2)? -1:
- TSignedSeqPos(box[2]+(strand?start2:(1-pos2))));
- strands.push_back(eNa_strand_plus);
- strands.push_back(strand? eNa_strand_plus: eNa_strand_minus);
- switch(seg_type) {
- case 0: seg_len = pos1 - start1; break;
- case 1: seg_len = pos2 - start2; break;
- case 2: seg_len = pos1 - start1; break;
- }
- lens.push_back(seg_len);
- ++seg_count;
-
- // start a new seg
- start1 = pos1;
- start2 = pos2;
- if(c == 'M' || c == 'R') {
- seg_type = 0; // matches and mismatches
- ++pos1;
- ++pos2;
- }
- else if (c == 'I') {
- seg_type = 1; // inserts
- ++pos2;
- }
- else {
- seg_type = 2; // dels
- ++pos1;
- }
- ++ii;
- }
- }
- else {
- size_t len = 0;
- while(ii < ie && ('0' <= *ii && *ii <= '9')) {
- len = 10*len + *ii - '0';
- ++ii;
- }
- --len;
- switch(seg_type) {
- case 0: pos1 += len; pos2 += len; break;
- case 1: pos2 += len; break;
- case 2: pos1 += len; break;
- }
- }
- }
-
- starts.push_back( (seg_type == 1)? -1: TSignedSeqPos(box[0] + start1) );
- starts.push_back( (seg_type == 2)? -1:
- TSignedSeqPos(box[2] + (strand? start2: (1 - pos2))) );
- strands.push_back(eNa_strand_plus);
- strands.push_back(strand? eNa_strand_plus: eNa_strand_minus);
- switch(seg_type) {
- case 0: seg_len = pos1 - start1; break;
- case 1: seg_len = pos2 - start2; break;
- case 2: seg_len = pos1 - start1; break;
- }
- lens.push_back(seg_len);
- ++seg_count;
- ds.SetNumseg(seg_count);
- }
- END_NCBI_SCOPE
- /*
- * ===========================================================================
- *
- * $Log: splign_formatter.cpp,v $
- * Revision 1000.0 2004/06/01 18:12:43 gouriano
- * PRODUCTION: IMPORTED [GCC34_MSVC7] Dev-tree R1.6
- *
- * Revision 1.6 2004/05/24 16:13:57 gorelenk
- * Added PCH ncbi_pch.hpp
- *
- * Revision 1.5 2004/05/04 15:23:45 ucko
- * Split splign code out of xalgoalign into new xalgosplign.
- *
- * Revision 1.4 2004/04/30 15:00:47 kapustin
- * Support ASN formatting
- *
- * Revision 1.3 2004/04/23 14:37:44 kapustin
- * *** empty log message ***
- *
- *
- * ===========================================================================
- */