cav_seqset.cpp
上传用户:yhdzpy8989
上传日期:2007-06-13
资源大小:13604k
文件大小:15k
- /*
- * ===========================================================================
- * PRODUCTION $Log: cav_seqset.cpp,v $
- * PRODUCTION Revision 1000.2 2004/06/01 19:41:21 gouriano
- * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.5
- * PRODUCTION
- * ===========================================================================
- */
- /* $Id: cav_seqset.cpp,v 1000.2 2004/06/01 19:41:21 gouriano Exp $
- * ===========================================================================
- *
- * PUBLIC DOMAIN NOTICE
- * National Center for Biotechnology Information
- *
- * This software/database is a "United States Government Work" under the
- * terms of the United States Copyright Act. It was written as part of
- * the author's official duties as a United States Government employee and
- * thus cannot be copyrighted. This software/database is freely available
- * to the public for use. The National Library of Medicine and the U.S.
- * Government have not placed any restriction on its use or reproduction.
- *
- * Although all reasonable efforts have been taken to ensure the accuracy
- * and reliability of the software and data, the NLM and the U.S.
- * Government do not and cannot warrant the performance or results that
- * may be obtained by using this software or data. The NLM and the U.S.
- * Government disclaim all warranties, express or implied, including
- * warranties of performance, merchantability or fitness for any particular
- * purpose.
- *
- * Please cite the author in any work or product based on this material.
- *
- * ===========================================================================
- *
- * Authors: Paul Thiessen
- *
- * File Description:
- * Classes to hold sets of sequences
- *
- * ===========================================================================
- */
- #include <ncbi_pch.hpp>
- #include <corelib/ncbistd.hpp>
- #include <corelib/ncbistre.hpp>
- #include <objects/seq/Bioseq.hpp>
- #include <objects/seqloc/Seq_id.hpp>
- #include <objects/seqloc/PDB_seq_id.hpp>
- #include <objects/seqloc/PDB_mol_id.hpp>
- #include <objects/general/Object_id.hpp>
- #include <objects/seqset/Bioseq_set.hpp>
- #include <objects/seq/Seq_inst.hpp>
- #include <objects/seq/Seq_data.hpp>
- #include <objects/seq/NCBIeaa.hpp>
- #include <objects/seq/IUPACaa.hpp>
- #include <objects/seq/NCBI4na.hpp>
- #include <objects/seq/NCBI8na.hpp>
- #include <objects/seq/NCBI2na.hpp>
- #include <objects/seq/IUPACna.hpp>
- #include <objects/seq/NCBIstdaa.hpp>
- #include <objects/seq/Seq_annot.hpp>
- #include <objects/general/Dbtag.hpp>
- #include <objects/seqloc/Textseq_id.hpp>
- #include <objects/seq/Seq_descr.hpp>
- #include <objects/seq/Seqdesc.hpp>
- #include <objects/seqblock/PDB_block.hpp>
- #include <memory>
- #include <objtools/cddalignview/cav_seqset.hpp>
- #include <objtools/cddalignview/cddalignview.h>
- BEGIN_NCBI_SCOPE
- USING_SCOPE(objects);
- void SequenceSet::UnpackSeqSet(const CBioseq_set& bss)
- {
- CBioseq_set::TSeq_set::const_iterator q, qe = bss.GetSeq_set().end();
- for (q=bss.GetSeq_set().begin(); q!=qe; ++q) {
- if (q->GetObject().IsSeq()) {
- // only store amino acid or nucleotide sequences
- if (q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_aa &&
- q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_dna &&
- q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_rna &&
- q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_na)
- continue;
- const Sequence *sequence = new Sequence(q->GetObject().GetSeq());
- if (!sequence || sequence->Status() != CAV_SUCCESS) {
- status = sequence->Status();
- return;
- }
- sequences.push_back(sequence);
- } else { // Bioseq-set
- UnpackSeqSet(q->GetObject().GetSet());
- }
- }
- }
- void SequenceSet::UnpackSeqEntry(const CSeq_entry& seqEntry)
- {
- if (seqEntry.IsSeq()) {
- const Sequence *sequence = new Sequence(seqEntry.GetSeq());
- if (!sequence || sequence->Status() != CAV_SUCCESS) {
- status = sequence->Status();
- return;
- }
- sequences.push_back(sequence);
- } else { // Bioseq-set
- UnpackSeqSet(seqEntry.GetSet());
- }
- }
- SequenceSet::SequenceSet(const CSeq_entry& seqEntry) :
- master(NULL), status(CAV_SUCCESS)
- {
- UnpackSeqEntry(seqEntry);
- ERR_POST(Info << "number of sequences: " << sequences.size());
- }
- SequenceSet::SequenceSet(const SeqEntryList& seqEntries) :
- master(NULL), status(CAV_SUCCESS)
- {
- SeqEntryList::const_iterator s, se = seqEntries.end();
- for (s=seqEntries.begin(); s!=se; ++s)
- UnpackSeqEntry(s->GetObject());
- ERR_POST(Info << "number of sequences: " << sequences.size());
- }
- SequenceSet::~SequenceSet(void)
- {
- SequenceList::iterator s, se = sequences.end();
- for (s=sequences.begin(); s!=se; ++s) delete *s;
- }
- const int Sequence::NOT_SET = -1;
- #define FIRSTOF2(byte) (((byte) & 0xF0) >> 4)
- #define SECONDOF2(byte) ((byte) & 0x0F)
- static void StringFrom4na(const vector< char >& vec, string *str, bool isDNA)
- {
- if (SECONDOF2(vec.back()) > 0)
- str->resize(vec.size() * 2);
- else
- str->resize(vec.size() * 2 - 1);
- // first, extract 4-bit values
- int i;
- for (i=0; i<vec.size(); ++i) {
- str->at(2*i) = FIRSTOF2(vec[i]);
- if (SECONDOF2(vec[i]) > 0) str->at(2*i + 1) = SECONDOF2(vec[i]);
- }
- // then convert 4-bit values to ascii characters
- for (i=0; i<str->size(); ++i) {
- switch (str->at(i)) {
- case 1: str->at(i) = 'A'; break;
- case 2: str->at(i) = 'C'; break;
- case 4: str->at(i) = 'G'; break;
- case 8: isDNA ? str->at(i) = 'T' : str->at(i) = 'U'; break;
- default:
- str->at(i) = 'X';
- }
- }
- }
- #define FIRSTOF4(byte) (((byte) & 0xC0) >> 6)
- #define SECONDOF4(byte) (((byte) & 0x30) >> 4)
- #define THIRDOF4(byte) (((byte) & 0x0C) >> 2)
- #define FOURTHOF4(byte) ((byte) & 0x03)
- static void StringFrom2na(const vector< char >& vec, string *str, bool isDNA)
- {
- str->resize(vec.size() * 4);
- // first, extract 4-bit values
- int i;
- for (i=0; i<vec.size(); ++i) {
- str->at(4*i) = FIRSTOF4(vec[i]);
- str->at(4*i + 1) = SECONDOF4(vec[i]);
- str->at(4*i + 2) = THIRDOF4(vec[i]);
- str->at(4*i + 3) = FOURTHOF4(vec[i]);
- }
- // then convert 4-bit values to ascii characters
- for (i=0; i<str->size(); ++i) {
- switch (str->at(i)) {
- case 0: str->at(i) = 'A'; break;
- case 1: str->at(i) = 'C'; break;
- case 2: str->at(i) = 'G'; break;
- case 3: isDNA ? str->at(i) = 'T' : str->at(i) = 'U'; break;
- }
- }
- }
- static void StringFromStdaa(const vector < char >& vec, std::string *str)
- {
- static const char *stdaaMap = "-ABCDEFGHIKLMNPQRSTVWXYZU*";
- str->resize(vec.size());
- for (int i=0; i<vec.size(); ++i)
- str->at(i) = stdaaMap[vec[i]];
- }
- Sequence::Sequence(const CBioseq& bioseq) :
- gi(NOT_SET), pdbChain(' '), mmdbLink(NOT_SET), status(CAV_ERROR_SEQUENCES),
- bioseqASN(&bioseq)
- {
- // get Seq-id info
- CBioseq::TId::const_iterator s, se = bioseq.GetId().end();
- for (s=bioseq.GetId().begin(); s!=se; ++s) {
- if (s->GetObject().IsGi()) {
- gi = s->GetObject().GetGi();
- } else if (s->GetObject().IsPdb()) {
- pdbID = s->GetObject().GetPdb().GetMol().Get();
- if (s->GetObject().GetPdb().IsSetChain())
- pdbChain = s->GetObject().GetPdb().GetChain();
- } else if (s->GetObject().IsLocal() && s->GetObject().GetLocal().IsStr()) {
- pdbID = s->GetObject().GetLocal().GetStr();
- } else if (s->GetObject().IsGenbank() && s->GetObject().GetGenbank().IsSetAccession()) {
- accession = s->GetObject().GetGenbank().GetAccession();
- } else if (s->GetObject().IsSwissprot() && s->GetObject().GetSwissprot().IsSetAccession()) {
- accession = s->GetObject().GetSwissprot().GetAccession();
- }
- }
- if (gi == NOT_SET && pdbID.size() == 0 && accession.size() == 0) {
- ERR_POST(Error << "Sequence::Sequence() - can't parse SeqId");
- return;
- }
- // try to get description from title or compound
- if (bioseq.IsSetDescr()) {
- CSeq_descr::Tdata::const_iterator d, de = bioseq.GetDescr().Get().end();
- for (d=bioseq.GetDescr().Get().begin(); d!=de; ++d) {
- if (d->GetObject().IsTitle()) {
- description = d->GetObject().GetTitle();
- break;
- } else if (d->GetObject().IsPdb() && d->GetObject().GetPdb().GetCompound().size() > 0) {
- description = d->GetObject().GetPdb().GetCompound().front();
- break;
- }
- }
- }
- // get link to MMDB id - mainly for CDD's where Biostrucs have to be loaded separately
- if (bioseq.IsSetAnnot()) {
- CBioseq::TAnnot::const_iterator a, ae = bioseq.GetAnnot().end();
- for (a=bioseq.GetAnnot().begin(); a!=ae; ++a) {
- if (a->GetObject().GetData().IsIds()) {
- CSeq_annot::C_Data::TIds::const_iterator i, ie = a->GetObject().GetData().GetIds().end();
- for (i=a->GetObject().GetData().GetIds().begin(); i!=ie; ++i) {
- if (i->GetObject().IsGeneral() &&
- i->GetObject().GetGeneral().GetDb() == "mmdb" &&
- i->GetObject().GetGeneral().GetTag().IsId()) {
- mmdbLink = i->GetObject().GetGeneral().GetTag().GetId();
- break;
- }
- }
- if (i != ie) break;
- }
- }
- }
- if (mmdbLink != NOT_SET)
- ERR_POST(Info << "sequence gi " << gi << ", PDB '" << pdbID << "' chain '" << (char) pdbChain <<
- "', is from MMDB id " << mmdbLink);
- // get sequence string
- if (bioseq.GetInst().GetRepr() == CSeq_inst::eRepr_raw && bioseq.GetInst().IsSetSeq_data()) {
- // protein formats
- if (bioseq.GetInst().GetSeq_data().IsNcbieaa()) {
- sequenceString = bioseq.GetInst().GetSeq_data().GetNcbieaa().Get();
- } else if (bioseq.GetInst().GetSeq_data().IsIupacaa()) {
- sequenceString = bioseq.GetInst().GetSeq_data().GetIupacaa().Get();
- } else if (bioseq.GetInst().GetSeq_data().IsNcbistdaa()) {
- StringFromStdaa(bioseq.GetInst().GetSeq_data().GetNcbistdaa().Get(), &sequenceString);
- }
- // nucleotide formats
- else if (bioseq.GetInst().GetSeq_data().IsIupacna()) {
- sequenceString = bioseq.GetInst().GetSeq_data().GetIupacna().Get();
- } else if (bioseq.GetInst().GetSeq_data().IsNcbi4na()) {
- StringFrom4na(bioseq.GetInst().GetSeq_data().GetNcbi4na().Get(), &sequenceString,
- (bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna));
- } else if (bioseq.GetInst().GetSeq_data().IsNcbi8na()) { // same repr. for non-X as 4na
- StringFrom4na(bioseq.GetInst().GetSeq_data().GetNcbi8na().Get(), &sequenceString,
- (bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna));
- } else if (bioseq.GetInst().GetSeq_data().IsNcbi2na()) {
- StringFrom2na(bioseq.GetInst().GetSeq_data().GetNcbi2na().Get(), &sequenceString,
- (bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna));
- if (bioseq.GetInst().IsSetLength() && bioseq.GetInst().GetLength() < sequenceString.length())
- sequenceString.resize(bioseq.GetInst().GetLength());
- }
- else {
- ERR_POST(Critical << "Sequence::Sequence() - sequence " << gi
- << ": confused by sequence string format");
- return;
- }
- if (bioseq.GetInst().IsSetLength() && bioseq.GetInst().GetLength() != sequenceString.length()) {
- ERR_POST(Critical << "Sequence::Sequence() - sequence string length mismatch");
- return;
- }
- } else {
- ERR_POST(Critical << "Sequence::Sequence() - sequence " << gi
- << ": confused by sequence representation");
- return;
- }
- status = CAV_SUCCESS;
- }
- CSeq_id * Sequence::CreateSeqId(void) const
- {
- CSeq_id *sid = new CSeq_id();
- if (pdbID.size() > 0) {
- sid->SetPdb().SetMol().Set(pdbID);
- if (pdbChain != ' ') sid->SetPdb().SetChain(pdbChain);
- } else { // use gi
- sid->SetGi(gi);
- }
- return sid;
- }
- string Sequence::GetTitle(void) const
- {
- CNcbiOstrstream oss;
- if (pdbID.size() > 0) {
- oss << pdbID;
- if (pdbChain != ' ') {
- oss << '_' << (char) pdbChain;
- }
- } else if (gi != NOT_SET)
- oss << "gi " << gi;
- else if (accession.size() > 0)
- oss << "acc " << accession;
- else
- oss << '?';
- oss << ' ';
- auto_ptr<char> chars(oss.str()); // deletes memory upon function return
- return string(oss.str());
- }
- END_NCBI_SCOPE
- /*
- * ---------------------------------------------------------------------------
- * $Log: cav_seqset.cpp,v $
- * Revision 1000.2 2004/06/01 19:41:21 gouriano
- * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.5
- *
- * Revision 1.5 2004/05/21 21:42:51 gorelenk
- * Added PCH ncbi_pch.hpp
- *
- * Revision 1.4 2004/05/08 10:56:20 thiessen
- * better handling of repeated sequences
- *
- * Revision 1.3 2004/03/15 18:51:27 thiessen
- * prefer prefix vs. postfix ++/-- operators
- *
- * Revision 1.2 2003/06/02 16:06:41 dicuccio
- * Rearranged src/objects/ subtree. This includes the following shifts:
- * - src/objects/asn2asn --> arc/app/asn2asn
- * - src/objects/testmedline --> src/objects/ncbimime/test
- * - src/objects/objmgr --> src/objmgr
- * - src/objects/util --> src/objmgr/util
- * - src/objects/alnmgr --> src/objtools/alnmgr
- * - src/objects/flat --> src/objtools/flat
- * - src/objects/validator --> src/objtools/validator
- * - src/objects/cddalignview --> src/objtools/cddalignview
- * In addition, libseq now includes six of the objects/seq... libs, and libmmdb
- * replaces the three libmmdb? libs.
- *
- * Revision 1.1 2003/03/19 19:04:12 thiessen
- * move again
- *
- * Revision 1.2 2003/03/19 16:06:28 thiessen
- * add <memory>
- *
- * Revision 1.1 2003/03/19 05:33:43 thiessen
- * move to src/app/cddalignview
- *
- * Revision 1.9 2003/02/03 17:52:03 thiessen
- * move CVS Log to end of file
- *
- * Revision 1.8 2002/05/28 17:55:48 thiessen
- * cavpr
- *
- * Revision 1.7 2001/07/12 21:32:13 thiessen
- * update for swissprot accessions
- *
- * Revision 1.6 2001/01/29 18:13:34 thiessen
- * split into C-callable library + main
- *
- * Revision 1.5 2001/01/25 00:51:20 thiessen
- * add command-line args; can read asn data from stdin
- *
- * Revision 1.4 2001/01/23 20:42:01 thiessen
- * add description
- *
- * Revision 1.3 2001/01/23 17:34:12 thiessen
- * add HTML output
- *
- * Revision 1.2 2001/01/22 15:55:11 thiessen
- * correctly set up ncbi namespacing
- *
- * Revision 1.1 2001/01/22 13:15:24 thiessen
- * initial checkin
- *
- */