su_sequence_set.cpp
上传用户:yhdzpy8989
上传日期:2007-06-13
资源大小:13604k
文件大小:13k
- /*
- * ===========================================================================
- * PRODUCTION $Log: su_sequence_set.cpp,v $
- * PRODUCTION Revision 1000.0 2004/06/01 18:14:43 gouriano
- * PRODUCTION PRODUCTION: IMPORTED [GCC34_MSVC7] Dev-tree R1.3
- * PRODUCTION
- * ===========================================================================
- */
- /* $Id: su_sequence_set.cpp,v 1000.0 2004/06/01 18:14:43 gouriano Exp $
- * ===========================================================================
- *
- * PUBLIC DOMAIN NOTICE
- * National Center for Biotechnology Information
- *
- * This software/database is a "United States Government Work" under the
- * terms of the United States Copyright Act. It was written as part of
- * the author's official duties as a United States Government employee and
- * thus cannot be copyrighted. This software/database is freely available
- * to the public for use. The National Library of Medicine and the U.S.
- * Government have not placed any restriction on its use or reproduction.
- *
- * Although all reasonable efforts have been taken to ensure the accuracy
- * and reliability of the software and data, the NLM and the U.S.
- * Government do not and cannot warrant the performance or results that
- * may be obtained by using this software or data. The NLM and the U.S.
- * Government disclaim all warranties, express or implied, including
- * warranties of performance, merchantability or fitness for any particular
- * purpose.
- *
- * Please cite the author in any work or product based on this material.
- *
- * ===========================================================================
- *
- * Authors: Paul Thiessen
- *
- * File Description:
- * Classes to hold sets of sequences
- *
- * ===========================================================================
- */
- #include <ncbi_pch.hpp>
- #include <corelib/ncbistd.hpp>
- #include <corelib/ncbistre.hpp>
- #include <corelib/ncbistl.hpp>
- #include <vector>
- #include <map>
- #include <objects/seqloc/Seq_id.hpp>
- #include <objects/seqloc/PDB_seq_id.hpp>
- #include <objects/seqloc/PDB_mol_id.hpp>
- #include <objects/general/Object_id.hpp>
- #include <objects/seqset/Bioseq_set.hpp>
- #include <objects/seq/Seq_inst.hpp>
- #include <objects/seq/Seq_data.hpp>
- #include <objects/seq/NCBIeaa.hpp>
- #include <objects/seq/IUPACaa.hpp>
- #include <objects/seq/NCBIstdaa.hpp>
- #include <objects/seq/NCBI4na.hpp>
- #include <objects/seq/NCBI8na.hpp>
- #include <objects/seq/NCBI2na.hpp>
- #include <objects/seq/IUPACna.hpp>
- #include <objects/seq/Seq_annot.hpp>
- #include <objects/general/Dbtag.hpp>
- #include <objects/seqloc/Textseq_id.hpp>
- #include <objects/seq/Seq_descr.hpp>
- #include <objects/seq/Seqdesc.hpp>
- #include <objects/seqblock/PDB_block.hpp>
- #include <objects/seqfeat/BioSource.hpp>
- #include <objects/seqfeat/Org_ref.hpp>
- #include "su_sequence_set.hpp"
- #include "su_private.hpp"
- // borrow Cn3D's asn conversion functions
- #include "../src/app/cn3d/asn_converter.hpp"
- // C-toolkit stuff for BioseqPtr creation
- #include <objseq.h>
- #include <objalign.h>
- USING_NCBI_SCOPE;
- USING_SCOPE(objects);
- BEGIN_SCOPE(struct_util)
- // holds C Bioseqs associated with Sequences
- typedef map < const Sequence *, Bioseq * > BioseqMap;
- static BioseqMap bioseqs;
- void AddCSeqId(SeqIdPtr *sid, const ncbi::objects::CSeq_id& cppid)
- {
- string err;
- ValNode *vn = (SeqIdPtr) Cn3D::ConvertAsnFromCPPToC(cppid, (AsnReadFunc) SeqIdAsnRead, &err);
- if (!vn || err.size() > 0) {
- ERROR_MESSAGE("AddCSeqId() - ConvertAsnFromCPPToC() failed");
- return;
- }
- ValNodeLink(sid, vn);
- }
- static void AddCSeqIdAll(SeqIdPtr *id, const Sequence& sequence)
- {
- CBioseq::TId::const_iterator i, ie = sequence.GetAllIdentifiers().end();
- for (i=sequence.GetAllIdentifiers().begin(); i!=ie; ++i)
- AddCSeqId(id, **i);
- }
- BioseqPtr GetOrCreateBioseq(const Sequence *sequence)
- {
- if (!sequence || !sequence->m_isProtein) {
- ERROR_MESSAGE("GetOrCreateBioseq() - got non-protein or NULL Sequence");
- return NULL;
- }
- // if already done
- BioseqMap::const_iterator b = bioseqs.find(sequence);
- if (b != bioseqs.end())
- return b->second;
- // create new Bioseq and fill it in from Sequence data
- BioseqPtr bioseq = BioseqNew();
- bioseq->mol = Seq_mol_aa;
- bioseq->seq_data_type = Seq_code_ncbieaa;
- bioseq->repr = Seq_repr_raw;
- bioseq->length = sequence->Length();
- bioseq->seq_data = BSNew(bioseq->length);
- BSWrite(bioseq->seq_data, const_cast<char*>(sequence->m_sequenceString.c_str()), bioseq->length);
- // create Seq-id
- AddCSeqIdAll(&(bioseq->id), *sequence);
- // store Bioseq
- bioseqs[sequence] = bioseq;
- return bioseq;
- }
- static void UnpackSeqSet(CBioseq_set& bss, SequenceSet::SequenceList& seqlist)
- {
- CBioseq_set::TSeq_set::iterator q, qe = bss.SetSeq_set().end();
- for (q=bss.SetSeq_set().begin(); q!=qe; ++q) {
- if (q->GetObject().IsSeq()) {
- // only store amino acid or nucleotide sequences
- if (q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_aa &&
- q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_dna &&
- q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_rna &&
- q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_na)
- continue;
- CRef < Sequence > sequence(new Sequence(q->GetObject().SetSeq()));
- seqlist.push_back(sequence);
- } else { // Bioseq-set
- UnpackSeqSet(q->GetObject().SetSet(), seqlist);
- }
- }
- }
- static void UnpackSeqEntry(CSeq_entry& seqEntry, SequenceSet::SequenceList& seqlist)
- {
- if (seqEntry.IsSeq()) {
- CRef < Sequence > sequence(new Sequence(seqEntry.SetSeq()));
- seqlist.push_back(sequence);
- } else { // Bioseq-set
- UnpackSeqSet(seqEntry.SetSet(), seqlist);
- }
- }
- SequenceSet::SequenceSet(SeqEntryList& seqEntries)
- {
- SeqEntryList::iterator s, se = seqEntries.end();
- for (s=seqEntries.begin(); s!=se; ++s)
- UnpackSeqEntry(s->GetObject(), m_sequences);
- TRACE_MESSAGE("number of sequences: " << m_sequences.size());
- }
- #define FIRSTOF2(byte) (((byte) & 0xF0) >> 4)
- #define SECONDOF2(byte) ((byte) & 0x0F)
- static void StringFrom4na(const vector< char >& vec, string *str, bool isDNA)
- {
- if (SECONDOF2(vec.back()) > 0)
- str->resize(vec.size() * 2);
- else
- str->resize(vec.size() * 2 - 1);
- // first, extract 4-bit values
- unsigned int i;
- for (i=0; i<vec.size(); ++i) {
- str->at(2*i) = FIRSTOF2(vec[i]);
- if (SECONDOF2(vec[i]) > 0) str->at(2*i + 1) = SECONDOF2(vec[i]);
- }
- // then convert 4-bit values to ascii characters
- for (i=0; i<str->size(); ++i) {
- switch (str->at(i)) {
- case 1: str->at(i) = 'A'; break;
- case 2: str->at(i) = 'C'; break;
- case 4: str->at(i) = 'G'; break;
- case 8: isDNA ? str->at(i) = 'T' : str->at(i) = 'U'; break;
- default:
- str->at(i) = 'X';
- }
- }
- }
- #define FIRSTOF4(byte) (((byte) & 0xC0) >> 6)
- #define SECONDOF4(byte) (((byte) & 0x30) >> 4)
- #define THIRDOF4(byte) (((byte) & 0x0C) >> 2)
- #define FOURTHOF4(byte) ((byte) & 0x03)
- static void StringFrom2na(const vector< char >& vec, string *str, bool isDNA)
- {
- str->resize(vec.size() * 4);
- // first, extract 4-bit values
- unsigned int i;
- for (i=0; i<vec.size(); ++i) {
- str->at(4*i) = FIRSTOF4(vec[i]);
- str->at(4*i + 1) = SECONDOF4(vec[i]);
- str->at(4*i + 2) = THIRDOF4(vec[i]);
- str->at(4*i + 3) = FOURTHOF4(vec[i]);
- }
- // then convert 4-bit values to ascii characters
- for (i=0; i<str->size(); ++i) {
- switch (str->at(i)) {
- case 0: str->at(i) = 'A'; break;
- case 1: str->at(i) = 'C'; break;
- case 2: str->at(i) = 'G'; break;
- case 3: isDNA ? str->at(i) = 'T' : str->at(i) = 'U'; break;
- }
- }
- }
- static void StringFromStdaa(const vector < char >& vec, string *str)
- {
- static const char *stdaaMap = "-ABCDEFGHIKLMNPQRSTVWXYZU*";
- str->resize(vec.size());
- for (unsigned int i=0; i<vec.size(); ++i)
- str->at(i) = stdaaMap[vec[i]];
- }
- Sequence::Sequence(ncbi::objects::CBioseq& bioseq) :
- m_bioseqASN(&bioseq), m_isProtein(false)
- {
- // fill out description
- if (bioseq.IsSetDescr()) {
- string defline, taxid;
- CSeq_descr::Tdata::const_iterator d, de = bioseq.GetDescr().Get().end();
- for (d=bioseq.GetDescr().Get().begin(); d!=de; ++d) {
- // get "defline" from title or compound
- if ((*d)->IsTitle()) { // prefer title over compound
- defline = (*d)->GetTitle();
- } else if (defline.size() == 0 && (*d)->IsPdb() && (*d)->GetPdb().GetCompound().size() > 0) {
- defline = (*d)->GetPdb().GetCompound().front();
- }
- // get taxonomy
- if ((*d)->IsSource()) {
- if ((*d)->GetSource().GetOrg().IsSetTaxname())
- taxid = (*d)->GetSource().GetOrg().GetTaxname();
- else if ((*d)->GetSource().GetOrg().IsSetCommon())
- taxid = (*d)->GetSource().GetOrg().GetCommon();
- }
- }
- if (taxid.size() > 0)
- m_description = string("[") + taxid + ']';
- if (defline.size() > 0) {
- if (taxid.size() > 0)
- m_description += ' ';
- m_description += defline;
- }
- }
- // get sequence string
- if (bioseq.GetInst().GetRepr() == CSeq_inst::eRepr_raw && bioseq.GetInst().IsSetSeq_data()) {
- // protein formats
- if (bioseq.GetInst().GetSeq_data().IsNcbieaa()) {
- m_sequenceString = bioseq.GetInst().GetSeq_data().GetNcbieaa().Get();
- m_isProtein = true;
- } else if (bioseq.GetInst().GetSeq_data().IsIupacaa()) {
- m_sequenceString = bioseq.GetInst().GetSeq_data().GetIupacaa().Get();
- m_isProtein = true;
- } else if (bioseq.GetInst().GetSeq_data().IsNcbistdaa()) {
- StringFromStdaa(bioseq.GetInst().GetSeq_data().GetNcbistdaa().Get(), &m_sequenceString);
- m_isProtein = true;
- }
- // nucleotide formats
- else if (bioseq.GetInst().GetSeq_data().IsIupacna()) {
- m_sequenceString = bioseq.GetInst().GetSeq_data().GetIupacna().Get();
- // convert 'T' to 'U' for RNA
- if (bioseq.GetInst().GetMol() == CSeq_inst::eMol_rna) {
- for (unsigned int i=0; i<m_sequenceString.size(); ++i) {
- if (m_sequenceString[i] == 'T')
- m_sequenceString[i] = 'U';
- }
- }
- } else if (bioseq.GetInst().GetSeq_data().IsNcbi4na()) {
- StringFrom4na(bioseq.GetInst().GetSeq_data().GetNcbi4na().Get(), &m_sequenceString,
- (bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna));
- } else if (bioseq.GetInst().GetSeq_data().IsNcbi8na()) { // same repr. for non-X as 4na
- StringFrom4na(bioseq.GetInst().GetSeq_data().GetNcbi8na().Get(), &m_sequenceString,
- (bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna));
- } else if (bioseq.GetInst().GetSeq_data().IsNcbi2na()) {
- StringFrom2na(bioseq.GetInst().GetSeq_data().GetNcbi2na().Get(), &m_sequenceString,
- (bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna));
- if (bioseq.GetInst().IsSetLength() && bioseq.GetInst().GetLength() < m_sequenceString.length())
- m_sequenceString.resize(bioseq.GetInst().GetLength());
- }
- else
- THROW_MESSAGE("Sequence::Sequence(): confused by sequence format");
- // check length
- if (bioseq.GetInst().IsSetLength() && bioseq.GetInst().GetLength() != m_sequenceString.length())
- THROW_MESSAGE("Sequence::Sequence() - sequence string length mismatch");
- // force uppercase
- for (unsigned int i=0; i<m_sequenceString.size(); ++i)
- m_sequenceString[i] = toupper(m_sequenceString[i]);
- } else
- THROW_MESSAGE("Sequence::Sequence(): confused by sequence representation");
- }
- Sequence::~Sequence(void)
- {
- BioseqMap::iterator b = bioseqs.find(this);
- if (b != bioseqs.end()) {
- BioseqFree(b->second);
- bioseqs.erase(b);
- }
- }
- #define RETURN_FIRST_SEQID_THAT_(is)
- for (i=m_bioseqASN->GetId().begin(); i!=ie; ++i)
- if ((*i)->is())
- return **i
- const CSeq_id& Sequence::GetPreferredIdentifier(void) const
- {
- CBioseq::TId::const_iterator i, ie = m_bioseqASN->GetId().end();
- // try to find one of these first
- RETURN_FIRST_SEQID_THAT_(IsPdb);
- RETURN_FIRST_SEQID_THAT_(IsGi);
- // otherwise, just use the first one
- return m_bioseqASN->GetId().front().GetObject();
- }
- bool Sequence::MatchesSeqId(const CSeq_id& seqID) const
- {
- CBioseq::TId::const_iterator i, ie = m_bioseqASN->GetId().end();
- for (i=m_bioseqASN->GetId().begin(); i!=ie; ++i) {
- if (seqID.Match(**i))
- return true;
- }
- return false;
- }
- string Sequence::IdentifierString(void) const
- {
- return CSeq_id::GetStringDescr(*m_bioseqASN, CSeq_id::eFormat_FastA);
- }
- END_SCOPE(struct_util)
- /*
- * ---------------------------------------------------------------------------
- * $Log: su_sequence_set.cpp,v $
- * Revision 1000.0 2004/06/01 18:14:43 gouriano
- * PRODUCTION: IMPORTED [GCC34_MSVC7] Dev-tree R1.3
- *
- * Revision 1.3 2004/05/28 09:46:57 thiessen
- * restructure C-toolkit header usage ; move C Bioseq storage into su_sequence_set
- *
- * Revision 1.2 2004/05/25 15:52:18 thiessen
- * add BlockMultipleAlignment, IBM algorithm
- *
- * Revision 1.1 2004/05/24 23:04:05 thiessen
- * initial checkin
- *
- */