seqdb.hpp
上传用户:yhdzpy8989
上传日期:2007-06-13
资源大小:13604k
文件大小:13k
- /*
- * ===========================================================================
- * PRODUCTION $Log: seqdb.hpp,v $
- * PRODUCTION Revision 1000.1 2004/04/21 19:27:30 gouriano
- * PRODUCTION PRODUCTION: UPGRADED [CATCHUP_003] Dev-tree R1.17
- * PRODUCTION
- * ===========================================================================
- */
- #ifndef OBJTOOLS_READERS_SEQDB__SEQDB_HPP
- #define OBJTOOLS_READERS_SEQDB__SEQDB_HPP
- /* $Id: seqdb.hpp,v 1000.1 2004/04/21 19:27:30 gouriano Exp $
- * ===========================================================================
- *
- * PUBLIC DOMAIN NOTICE
- * National Center for Biotechnology Information
- *
- * This software/database is a "United States Government Work" under the
- * terms of the United States Copyright Act. It was written as part of
- * the author's official duties as a United States Government employee and
- * thus cannot be copyrighted. This software/database is freely available
- * to the public for use. The National Library of Medicine and the U.S.
- * Government have not placed any restriction on its use or reproduction.
- *
- * Although all reasonable efforts have been taken to ensure the accuracy
- * and reliability of the software and data, the NLM and the U.S.
- * Government do not and cannot warrant the performance or results that
- * may be obtained by using this software or data. The NLM and the U.S.
- * Government disclaim all warranties, express or implied, including
- * warranties of performance, merchantability or fitness for any particular
- * purpose.
- *
- * Please cite the author in any work or product based on this material.
- *
- * ===========================================================================
- *
- * Author: Kevin Bealer
- *
- */
- /// @file seqdb.hpp
- /// Defines BLAST database access classes.
- ///
- /// Defines classes:
- /// CSeqDB
- /// CSeqDBSequence
- ///
- /// Implemented for: UNIX, MS-Windows
- #include <objtools/readers/seqdb/seqdbcommon.hpp>
- #include <objects/blastdb/Blast_def_line.hpp>
- #include <objects/blastdb/Blast_def_line_set.hpp>
- #include <objects/general/Dbtag.hpp>
- #include <objects/general/Object_id.hpp>
- #include <objects/seq/Bioseq.hpp>
- #include <objects/seqloc/Seq_id.hpp>
- BEGIN_NCBI_SCOPE
- USING_SCOPE(objects);
- /////////////////////////////////////////////////////////////////////////////
- ///
- /// CSeqDBIter --
- ///
- /// Small class to iterate over a seqdb database.
- ///
- /// This serves something of the same role for a CSeqDB object that a
- /// vector iterator might serve in the standard template library.
- class CSeqDB;
- class CSeqDBIter {
- public:
- typedef Uint4 TOID;
-
- virtual ~CSeqDBIter()
- {
- x_RetSeq();
- }
-
- CSeqDBIter & operator++(void);
-
- TOID GetOID(void)
- {
- return m_OID;
- }
-
- const char * GetData(void)
- {
- return m_Data;
- }
-
- Uint4 GetLength(void)
- {
- return m_Length;
- }
-
- operator bool()
- {
- return m_Length != (Uint4)-1;
- }
-
- CSeqDBIter(const CSeqDBIter &);
-
- CSeqDBIter & operator =(const CSeqDBIter &);
-
- private:
- inline void x_GetSeq(void);
- inline void x_RetSeq(void);
-
- friend class CSeqDB;
-
- CSeqDBIter(const CSeqDB *, Uint4 oid);
-
- const CSeqDB * m_DB;
- TOID m_OID;
- const char * m_Data;
- Uint4 m_Length;
- };
- /////////////////////////////////////////////////////////////////////////////
- ///
- /// CSeqDB --
- ///
- /// User interface class for blast databases.
- ///
- /// This class provides the top-level interface class for BLAST
- /// database users. It defines access to the database component by
- /// calling methods on objects which represent the various database
- /// files, such as the index, header, sequence, and alias files.
- class NCBI_XOBJREAD_EXPORT CSeqDB : public CObject {
- public:
- /// Sequence type accepted and returned for OID indexes.
- typedef Uint4 TOID;
-
- /// Short Constructor.
- ///
- /// @param dbname
- /// A list of database or alias names, seperated by spaces.
- /// @param prot_nucl
- /// Either kSeqTypeProt for a protein database, or kSeqTypeNucl
- /// for nucleotide. These can also be specified as 'p' or 'n'.
-
- CSeqDB(const string & dbname, char prot_nucl);
-
- /// Constructor with MMap Flag and OID Range.
- ///
- /// @param dbname
- /// A list of database or alias names, seperated by spaces.
- /// @param prot_nucl
- /// Either kSeqTypeProt for a protein database, or kSeqTypeNucl
- /// for nucleotide. These can also be specified as 'p' or 'n'.
- /// @param oid_begin
- /// Iterator will skip OIDs less than this value. Only OIDs
- /// found in the OID lists (if any) will be returned.
- /// @param oid_end
- /// Iterator will return up to (but not including) this OID.
- /// @param use_mmap
- /// If kSeqDBMMap is specified (the default), memory mapping is
- /// attempted. If kSeqDBNoMMap is specified, or memory mapping
- /// fails, this platform does not support it, the less efficient
- /// read and write calls are used instead.
-
- CSeqDB(const string & dbname,
- char prot_nucl,
- Uint4 oid_begin,
- Uint4 oid_end,
- bool use_mmap);
-
- /// Destructor.
- ///
- /// This will return resources acquired by this object, including
- /// any gotten by the GetSequence() call, whether or not they have
- /// been returned by RetSequence().
- ~CSeqDB();
-
-
- /// Returns the sequence length in base pairs or residues.
- Uint4 GetSeqLength(TOID oid) const;
-
- /// Returns an unbiased, approximate sequence length.
- ///
- /// For protein DBs, this method is identical to GetSeqLength().
- /// In the nucleotide case, computing the exact length requires
- /// examination of the sequence data. This method avoids doing
- /// that, returning an approximation ranging from L-3 to L+3
- /// (where L indicates the exact length), and unbiased on average.
- Uint4 GetSeqLengthApprox(TOID oid) const;
-
- /// Get the ASN.1 header for the sequence.
- CRef<CBlast_def_line_set> GetHdr(TOID oid) const;
-
- /// Get a CBioseq of the sequence.
- CRef<CBioseq> GetBioseq(TOID oid,
- bool use_objmgr = true,
- bool insert_ctrlA = false) const;
-
- /// Get a pointer to raw sequence data.
- ///
- /// Get the raw sequence (strand data). When done, resources
- /// should be returned with RetSequence. This data pointed to
- /// by *buffer is in read-only memory (where supported).
- /// @return
- /// The return value is the sequence length (in base pairs or
- /// residues). In case of an error, an exception is thrown.
- Uint4 GetSequence(TOID oid, const char ** buffer) const;
-
- /// Get a pointer to sequence data with embedded ambiguities.
- ///
- /// In the protein case, this is identical to GetSequence(). In
- /// the nucleotide case, it stores 2 bases per byte instead of 4.
- /// The third parameter indicates the encoding for nucleotide
- /// data, either kSeqDBNucl4NA or kSeqDBNuclBlastNA, ignored if
- /// the sequence is a protein sequence. When done, resources
- /// should be returned with RetSequence.
- /// @return
- /// The return value is the sequence length (in base pairs or
- /// residues). In case of an error, an exception is thrown.
- Uint4 GetAmbigSeq(TOID oid, const char ** buffer, Uint4 nucl_code) const;
-
- /// Get a pointer to sequence data with embedded ambiguities.
- ///
- /// This is like GetAmbigSeq(), but the allocated object should be
- /// deleted by the caller. This is intended for users who are
- /// going to modify the sequence data, or are going to mix the
- /// data into a container with other data, and who are mixing data
- /// from multiple sources and want to free the data in the same
- /// way. The fourth parameter should be given one of the values
- /// from EAllocStrategy; the corresponding method should be used
- /// to delete the object. Note that "delete[]" should be used
- /// instead of "delete"
- /// @param oid
- /// Ordinal ID.
- /// @param buffer
- /// Address of a char pointer to access the sequence data.
- /// @param nucl_code
- /// The NA encoding, kSeqDBNuclNcbiNA8 or kSeqDBNuclBlastNA8.
- /// @param alloc_strategy
- /// Indicate which allocation strategy to use.
- Uint4 GetAmbigSeqAlloc(TOID oid,
- char ** buffer,
- Uint4 nucl_code,
- ESeqDBAllocType strategy) const;
-
- /// Returns any resources associated with the sequence.
- ///
- /// Note that if memory mapping was successful, the sequence data
- /// is in read only memory; also, this method has no effect. If
- /// memory mapping failed, the sequence is probably in dynamically
- /// allocated memory and this method frees that memory.
- void RetSequence(const char ** buffer) const;
-
- /// Gets a list of sequence identifiers.
- ///
- /// This returns the list of CSeq_id identifiers associated with
- /// the sequence specified by the given OID.
- list< CRef<CSeq_id> > GetSeqIDs(TOID oid) const;
-
- /// Returns the type of database opened - protein or nucleotide.
- ///
- /// This uses the same constants as the constructor.
- char GetSeqType(void) const;
-
- /// Returns the database title.
- ///
- /// This is usually read from database volumes or alias files. If
- /// multiple databases were passed to the constructor, this will
- /// be a concatenation of those databases' titles.
- string GetTitle(void) const;
-
- /// Returns the construction date of the database.
- ///
- /// This is encoded in the database. If multiple databases or
- /// multiple volumes were accessed, the first available date will
- /// be used.
- string GetDate(void) const;
-
- /// Returns the number of sequences available.
- Uint4 GetNumSeqs(void) const;
-
- /// Returns the sum of the lengths of all available sequences.
- ///
- /// This uses summary information stored in the database volumes
- /// or alias files. It provides an exact value, without iterating
- /// over individual sequences.
- Uint8 GetTotalLength(void) const;
-
- /// Returns the length of the largest sequence in the database.
- ///
- /// This uses summary information stored in the database volumes
- /// or alias files. This might be used to chose buffer sizes.
- Uint4 GetMaxLength(void) const;
-
- /// Returns a sequence iterator.
- ///
- /// This gets an iterator designed to allow traversal of the
- /// database from beginning to end.
- CSeqDBIter Begin(void) const;
-
- /// Find an included OID, incrementing next_oid if necessary.
- ///
- /// If the specified OID is not included in the set (i.e. the OID
- /// mask), the input parameter is incremented until one is found
- /// that is. The user will probably want to increment between
- /// calls, if iterating over the db.
- /// @return
- /// True if a valid OID was found, false otherwise.
- bool CheckOrFindOID(TOID & next_oid) const;
-
- /// Get list of database names.
- ///
- /// This returns the database name list used at construction.
- /// @return
- /// List of database names.
- const string & GetDBNameList(void) const;
-
- private:
- /// Implementation details are hidden. (See seqdbimpl.hpp).
- class CSeqDBImpl * m_Impl;
- };
- /////////////////////////////////////////////////////////////////////////////
- ///
- /// CSeqDBSequence --
- ///
- /// Small class to manage return of sequence data.
- ///
- /// The CSeqDB class requires that sequences be returned at some point
- /// after they are gotten. This class provides that service via the
- /// destructor. It also insures that the database itself stays around
- /// for at least the duration of its lifetime, by holding a CRef<> to
- /// that object. CSeqDB::GetSequence may be used directly to avoid
- /// the small overhead of this class, provided care is taken to call
- /// CSeqDB::RetSequence. The data referred to by this object is not
- /// modifyable, and is memory mapped (read only) where supported.
- class CSeqDBSequence {
- public:
- CSeqDBSequence(CSeqDB * db, Uint4 oid)
- : m_DB (db),
- m_Data (0),
- m_Length(0)
- {
- m_Length = m_DB->GetSequence(oid, & m_Data);
- }
-
- ~CSeqDBSequence()
- {
- if (m_Data) {
- m_DB->RetSequence(& m_Data);
- }
- }
-
- const char * GetData(void)
- {
- return m_Data;
- }
-
- Uint4 GetLength(void)
- {
- return m_Length;
- }
-
- private:
- // Prevent copy for now - would be easy to fix.
- CSeqDBSequence(const CSeqDBSequence &);
- CSeqDBSequence & operator=(const CSeqDBSequence &);
-
- CRef<CSeqDB> m_DB;
- const char * m_Data;
- Uint4 m_Length;
- };
- // Inline methods for CSeqDBIter
- void CSeqDBIter::x_GetSeq(void)
- {
- m_Length = m_DB->GetSequence(m_OID, & m_Data);
- }
- void CSeqDBIter::x_RetSeq(void)
- {
- if (m_Data)
- m_DB->RetSequence(& m_Data);
- }
- END_NCBI_SCOPE
- #endif // OBJTOOLS_READERS_SEQDB__SEQDB_HPP