生物技术

开发平台：
C/C++

su_sequence_set.cpp：源码内容
							/*
 * ===========================================================================
 * PRODUCTION $Log: su_sequence_set.cpp,v $
 * PRODUCTION Revision 1000.0  2004/06/01 18:14:43  gouriano
 * PRODUCTION PRODUCTION: IMPORTED [GCC34_MSVC7] Dev-tree R1.3
 * PRODUCTION
 * ===========================================================================
 */
/*  $Id: su_sequence_set.cpp,v 1000.0 2004/06/01 18:14:43 gouriano Exp $
* ===========================================================================
*
*                            PUBLIC DOMAIN NOTICE
*               National Center for Biotechnology Information
*
*  This software/database is a "United States Government Work" under the
*  terms of the United States Copyright Act.  It was written as part of
*  the author's official duties as a United States Government employee and
*  thus cannot be copyrighted.  This software/database is freely available
*  to the public for use. The National Library of Medicine and the U.S.
*  Government have not placed any restriction on its use or reproduction.
*
*  Although all reasonable efforts have been taken to ensure the accuracy
*  and reliability of the software and data, the NLM and the U.S.
*  Government do not and cannot warrant the performance or results that
*  may be obtained by using this software or data. The NLM and the U.S.
*  Government disclaim all warranties, express or implied, including
*  warranties of performance, merchantability or fitness for any particular
*  purpose.
*
*  Please cite the author in any work or product based on this material.
*
* ===========================================================================
*
* Authors:  Paul Thiessen
*
* File Description:
*      Classes to hold sets of sequences
*
* ===========================================================================
*/
#include <ncbi_pch.hpp>
#include <corelib/ncbistd.hpp>
#include <corelib/ncbistre.hpp>
#include <corelib/ncbistl.hpp>
#include <vector>
#include <map>
#include <objects/seqloc/Seq_id.hpp>
#include <objects/seqloc/PDB_seq_id.hpp>
#include <objects/seqloc/PDB_mol_id.hpp>
#include <objects/general/Object_id.hpp>
#include <objects/seqset/Bioseq_set.hpp>
#include <objects/seq/Seq_inst.hpp>
#include <objects/seq/Seq_data.hpp>
#include <objects/seq/NCBIeaa.hpp>
#include <objects/seq/IUPACaa.hpp>
#include <objects/seq/NCBIstdaa.hpp>
#include <objects/seq/NCBI4na.hpp>
#include <objects/seq/NCBI8na.hpp>
#include <objects/seq/NCBI2na.hpp>
#include <objects/seq/IUPACna.hpp>
#include <objects/seq/Seq_annot.hpp>
#include <objects/general/Dbtag.hpp>
#include <objects/seqloc/Textseq_id.hpp>
#include <objects/seq/Seq_descr.hpp>
#include <objects/seq/Seqdesc.hpp>
#include <objects/seqblock/PDB_block.hpp>
#include <objects/seqfeat/BioSource.hpp>
#include <objects/seqfeat/Org_ref.hpp>
#include "su_sequence_set.hpp"
#include "su_private.hpp"
// borrow Cn3D's asn conversion functions
#include "../src/app/cn3d/asn_converter.hpp"
// C-toolkit stuff for BioseqPtr creation
#include <objseq.h>
#include <objalign.h>
USING_NCBI_SCOPE;
USING_SCOPE(objects);
BEGIN_SCOPE(struct_util)
// holds C Bioseqs associated with Sequences
typedef map < const Sequence *, Bioseq * > BioseqMap;
static BioseqMap bioseqs;
void AddCSeqId(SeqIdPtr *sid, const ncbi::objects::CSeq_id& cppid)
{
    string err;
    ValNode *vn = (SeqIdPtr) Cn3D::ConvertAsnFromCPPToC(cppid, (AsnReadFunc) SeqIdAsnRead, &err);
    if (!vn || err.size() > 0) {
        ERROR_MESSAGE("AddCSeqId() - ConvertAsnFromCPPToC() failed");
        return;
    }
    ValNodeLink(sid, vn);
}
static void AddCSeqIdAll(SeqIdPtr *id, const Sequence& sequence)
{
    CBioseq::TId::const_iterator i, ie = sequence.GetAllIdentifiers().end();
    for (i=sequence.GetAllIdentifiers().begin(); i!=ie; ++i)
        AddCSeqId(id, **i);
}
BioseqPtr GetOrCreateBioseq(const Sequence *sequence)
{
    if (!sequence || !sequence->m_isProtein) {
        ERROR_MESSAGE("GetOrCreateBioseq() - got non-protein or NULL Sequence");
        return NULL;
    }
    // if already done
    BioseqMap::const_iterator b = bioseqs.find(sequence);
    if (b != bioseqs.end())
        return b->second;
    // create new Bioseq and fill it in from Sequence data
    BioseqPtr bioseq = BioseqNew();
    bioseq->mol = Seq_mol_aa;
    bioseq->seq_data_type = Seq_code_ncbieaa;
    bioseq->repr = Seq_repr_raw;
    bioseq->length = sequence->Length();
    bioseq->seq_data = BSNew(bioseq->length);
    BSWrite(bioseq->seq_data, const_cast<char*>(sequence->m_sequenceString.c_str()), bioseq->length);
    // create Seq-id
    AddCSeqIdAll(&(bioseq->id), *sequence);
    // store Bioseq
    bioseqs[sequence] = bioseq;
    return bioseq;
}
static void UnpackSeqSet(CBioseq_set& bss, SequenceSet::SequenceList& seqlist)
{
    CBioseq_set::TSeq_set::iterator q, qe = bss.SetSeq_set().end();
    for (q=bss.SetSeq_set().begin(); q!=qe; ++q) {
        if (q->GetObject().IsSeq()) {
            // only store amino acid or nucleotide sequences
            if (q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_aa &&
                q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_dna &&
                q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_rna &&
                q->GetObject().GetSeq().GetInst().GetMol() != CSeq_inst::eMol_na)
                continue;
            CRef < Sequence > sequence(new Sequence(q->GetObject().SetSeq()));
            seqlist.push_back(sequence);
        } else { // Bioseq-set
            UnpackSeqSet(q->GetObject().SetSet(), seqlist);
        }
    }
}
static void UnpackSeqEntry(CSeq_entry& seqEntry, SequenceSet::SequenceList& seqlist)
{
    if (seqEntry.IsSeq()) {
        CRef < Sequence > sequence(new Sequence(seqEntry.SetSeq()));
        seqlist.push_back(sequence);
    } else { // Bioseq-set
        UnpackSeqSet(seqEntry.SetSet(), seqlist);
    }
}
SequenceSet::SequenceSet(SeqEntryList& seqEntries)
{
    SeqEntryList::iterator s, se = seqEntries.end();
    for (s=seqEntries.begin(); s!=se; ++s)
        UnpackSeqEntry(s->GetObject(), m_sequences);
    TRACE_MESSAGE("number of sequences: " << m_sequences.size());
}
#define FIRSTOF2(byte) (((byte) & 0xF0) >> 4)
#define SECONDOF2(byte) ((byte) & 0x0F)
static void StringFrom4na(const vector< char >& vec, string *str, bool isDNA)
{
    if (SECONDOF2(vec.back()) > 0)
        str->resize(vec.size() * 2);
    else
        str->resize(vec.size() * 2 - 1);
    // first, extract 4-bit values
    unsigned int i;
    for (i=0; i<vec.size(); ++i) {
        str->at(2*i) = FIRSTOF2(vec[i]);
        if (SECONDOF2(vec[i]) > 0) str->at(2*i + 1) = SECONDOF2(vec[i]);
    }
    // then convert 4-bit values to ascii characters
    for (i=0; i<str->size(); ++i) {
        switch (str->at(i)) {
            case 1: str->at(i) = 'A'; break;
            case 2: str->at(i) = 'C'; break;
            case 4: str->at(i) = 'G'; break;
            case 8: isDNA ? str->at(i) = 'T' : str->at(i) = 'U'; break;
            default:
                str->at(i) = 'X';
        }
    }
}
#define FIRSTOF4(byte) (((byte) & 0xC0) >> 6)
#define SECONDOF4(byte) (((byte) & 0x30) >> 4)
#define THIRDOF4(byte) (((byte) & 0x0C) >> 2)
#define FOURTHOF4(byte) ((byte) & 0x03)
static void StringFrom2na(const vector< char >& vec, string *str, bool isDNA)
{
    str->resize(vec.size() * 4);
    // first, extract 4-bit values
    unsigned int i;
    for (i=0; i<vec.size(); ++i) {
        str->at(4*i) = FIRSTOF4(vec[i]);
        str->at(4*i + 1) = SECONDOF4(vec[i]);
        str->at(4*i + 2) = THIRDOF4(vec[i]);
        str->at(4*i + 3) = FOURTHOF4(vec[i]);
    }
    // then convert 4-bit values to ascii characters
    for (i=0; i<str->size(); ++i) {
        switch (str->at(i)) {
            case 0: str->at(i) = 'A'; break;
            case 1: str->at(i) = 'C'; break;
            case 2: str->at(i) = 'G'; break;
            case 3: isDNA ? str->at(i) = 'T' : str->at(i) = 'U'; break;
        }
    }
}
static void StringFromStdaa(const vector < char >& vec, string *str)
{
    static const char *stdaaMap = "-ABCDEFGHIKLMNPQRSTVWXYZU*";
    str->resize(vec.size());
    for (unsigned int i=0; i<vec.size(); ++i)
        str->at(i) = stdaaMap[vec[i]];
}
Sequence::Sequence(ncbi::objects::CBioseq& bioseq) :
    m_bioseqASN(&bioseq), m_isProtein(false)
{
    // fill out description
    if (bioseq.IsSetDescr()) {
        string defline, taxid;
        CSeq_descr::Tdata::const_iterator d, de = bioseq.GetDescr().Get().end();
        for (d=bioseq.GetDescr().Get().begin(); d!=de; ++d) {
            // get "defline" from title or compound
            if ((*d)->IsTitle()) {              // prefer title over compound
                defline = (*d)->GetTitle();
            } else if (defline.size() == 0 && (*d)->IsPdb() && (*d)->GetPdb().GetCompound().size() > 0) {
                defline = (*d)->GetPdb().GetCompound().front();
            }
            // get taxonomy
            if ((*d)->IsSource()) {
                if ((*d)->GetSource().GetOrg().IsSetTaxname())
                    taxid = (*d)->GetSource().GetOrg().GetTaxname();
                else if ((*d)->GetSource().GetOrg().IsSetCommon())
                    taxid = (*d)->GetSource().GetOrg().GetCommon();
            }
        }
        if (taxid.size() > 0)
            m_description = string("[") + taxid + ']';
        if (defline.size() > 0) {
            if (taxid.size() > 0)
                m_description += ' ';
            m_description += defline;
        }
    }
    // get sequence string
    if (bioseq.GetInst().GetRepr() == CSeq_inst::eRepr_raw && bioseq.GetInst().IsSetSeq_data()) {
        // protein formats
        if (bioseq.GetInst().GetSeq_data().IsNcbieaa()) {
            m_sequenceString = bioseq.GetInst().GetSeq_data().GetNcbieaa().Get();
            m_isProtein = true;
        } else if (bioseq.GetInst().GetSeq_data().IsIupacaa()) {
            m_sequenceString = bioseq.GetInst().GetSeq_data().GetIupacaa().Get();
            m_isProtein = true;
        } else if (bioseq.GetInst().GetSeq_data().IsNcbistdaa()) {
            StringFromStdaa(bioseq.GetInst().GetSeq_data().GetNcbistdaa().Get(), &m_sequenceString);
            m_isProtein = true;
        }
        // nucleotide formats
        else if (bioseq.GetInst().GetSeq_data().IsIupacna()) {
            m_sequenceString = bioseq.GetInst().GetSeq_data().GetIupacna().Get();
            // convert 'T' to 'U' for RNA
            if (bioseq.GetInst().GetMol() == CSeq_inst::eMol_rna) {
                for (unsigned int i=0; i<m_sequenceString.size(); ++i) {
                    if (m_sequenceString[i] == 'T')
                        m_sequenceString[i] = 'U';
                }
            }
        } else if (bioseq.GetInst().GetSeq_data().IsNcbi4na()) {
            StringFrom4na(bioseq.GetInst().GetSeq_data().GetNcbi4na().Get(), &m_sequenceString,
                (bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna));
        } else if (bioseq.GetInst().GetSeq_data().IsNcbi8na()) {  // same repr. for non-X as 4na
            StringFrom4na(bioseq.GetInst().GetSeq_data().GetNcbi8na().Get(), &m_sequenceString,
                (bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna));
        } else if (bioseq.GetInst().GetSeq_data().IsNcbi2na()) {
            StringFrom2na(bioseq.GetInst().GetSeq_data().GetNcbi2na().Get(), &m_sequenceString,
                (bioseq.GetInst().GetMol() == CSeq_inst::eMol_dna));
            if (bioseq.GetInst().IsSetLength() && bioseq.GetInst().GetLength() < m_sequenceString.length())
                m_sequenceString.resize(bioseq.GetInst().GetLength());
        }
        else
            THROW_MESSAGE("Sequence::Sequence(): confused by sequence format");
        // check length
        if (bioseq.GetInst().IsSetLength() && bioseq.GetInst().GetLength() != m_sequenceString.length())
            THROW_MESSAGE("Sequence::Sequence() - sequence string length mismatch");
        // force uppercase
        for (unsigned int i=0; i<m_sequenceString.size(); ++i)
            m_sequenceString[i] = toupper(m_sequenceString[i]);
    } else
        THROW_MESSAGE("Sequence::Sequence(): confused by sequence representation");
}
Sequence::~Sequence(void)
{
    BioseqMap::iterator b = bioseqs.find(this);
    if (b != bioseqs.end()) {
        BioseqFree(b->second);
        bioseqs.erase(b);
    }
}
#define RETURN_FIRST_SEQID_THAT_(is) 
    for (i=m_bioseqASN->GetId().begin(); i!=ie; ++i) 
        if ((*i)->is()) 
            return **i
const CSeq_id& Sequence::GetPreferredIdentifier(void) const
{
    CBioseq::TId::const_iterator i, ie = m_bioseqASN->GetId().end();
    // try to find one of these first
    RETURN_FIRST_SEQID_THAT_(IsPdb);
    RETURN_FIRST_SEQID_THAT_(IsGi);
    // otherwise, just use the first one
    return m_bioseqASN->GetId().front().GetObject();
}
bool Sequence::MatchesSeqId(const CSeq_id& seqID) const
{
    CBioseq::TId::const_iterator i, ie = m_bioseqASN->GetId().end();
    for (i=m_bioseqASN->GetId().begin(); i!=ie; ++i) {
        if (seqID.Match(**i))
            return true;
    }
    return false;
}
string Sequence::IdentifierString(void) const
{
    return CSeq_id::GetStringDescr(*m_bioseqASN, CSeq_id::eFormat_FastA);
}
END_SCOPE(struct_util)
/*
* ---------------------------------------------------------------------------
* $Log: su_sequence_set.cpp,v $
* Revision 1000.0  2004/06/01 18:14:43  gouriano
* PRODUCTION: IMPORTED [GCC34_MSVC7] Dev-tree R1.3
*
* Revision 1.3  2004/05/28 09:46:57  thiessen
* restructure C-toolkit header usage ; move C Bioseq storage into su_sequence_set
*
* Revision 1.2  2004/05/25 15:52:18  thiessen
* add BlockMultipleAlignment, IBM algorithm
*
* Revision 1.1  2004/05/24 23:04:05  thiessen
* initial checkin
*
*/