restriction.hpp
上传用户:yhdzpy8989
上传日期:2007-06-13
资源大小:13604k
文件大小:12k
- /*
- * ===========================================================================
- * PRODUCTION $Log: restriction.hpp,v $
- * PRODUCTION Revision 1000.0 2003/10/29 19:28:55 gouriano
- * PRODUCTION PRODUCTION: IMPORTED [ORIGINAL] Dev-tree R1.14
- * PRODUCTION
- * ===========================================================================
- */
- /* $Id: restriction.hpp,v 1000.0 2003/10/29 19:28:55 gouriano Exp $
- * ===========================================================================
- *
- * PUBLIC DOMAIN NOTICE
- * National Center for Biotechnology Information
- *
- * This software/database is a "United States Government Work" under the
- * terms of the United States Copyright Act. It was written as part of
- * the author's official duties as a United States Government employee and
- * thus cannot be copyrighted. This software/database is freely available
- * to the public for use. The National Library of Medicine and the U.S.
- * Government have not placed any restriction on its use or reproduction.
- *
- * Although all reasonable efforts have been taken to ensure the accuracy
- * and reliability of the software and data, the NLM and the U.S.
- * Government do not and cannot warrant the performance or results that
- * may be obtained by using this software or data. The NLM and the U.S.
- * Government disclaim all warranties, express or implied, including
- * warranties of performance, merchantability or fitness for any particular
- * purpose.
- *
- * Please cite the author in any work or product based on this material.
- *
- * ===========================================================================
- *
- * Authors: Josh Cherry
- *
- * File Description: Classes for representing and finding restriction sites
- *
- */
- #ifndef ALGO_SEQUENCE___RESTRICTION__HPP
- #define ALGO_SEQUENCE___RESTRICTION__HPP
- #include <corelib/ncbistd.hpp>
- #include <corelib/ncbiobj.hpp>
- #include <objmgr/bioseq_handle.hpp>
- #include <algo/sequence/seq_match.hpp>
- #include <util/strsearch.hpp>
- BEGIN_NCBI_SCOPE
- USING_SCOPE(objects);
- ///
- /// This class represents a particular occurrence of a restriction
- /// site on a sequence (not to be confused with a CRSpec, which
- /// represents a *type* of restriction site).
- /// Contains the locations of beginning and end of recognition site,
- /// and vectors of cut sites on plus and minus strands.
- ///
- class NCBI_XALGOSEQ_EXPORT CRSite
- {
- public:
-     // Create a site from the start and end positions of its recognition
-     // sequence.  Both cut vectors start out empty.
-     CRSite(int start, int end);
-     // --- location of the recognition sequence ---
-     int  GetStart() const;
-     void SetStart(int pos);
-     int  GetEnd() const;
-     void SetEnd(int pos);
-     // --- cleavage locations ---
-     // Position 0 is the bond just before the recognition sequence.
-     // Get* hands out a read-only reference to the stored vector,
-     // Set* hands out a mutable reference to the same vector.
-     const vector<int>& GetPlusCuts() const;
-     vector<int>&       SetPlusCuts();
-     const vector<int>& GetMinusCuts() const;
-     vector<int>&       SetMinusCuts();
- private:
-     int         m_Start;      // start of the recognition sequence
-     int         m_End;        // end of the recognition sequence
-     vector<int> m_PlusCuts;   // cut sites on the plus strand
-     vector<int> m_MinusCuts;  // cut sites on the minus strand
- };
- /// Stream-insertion operator for a CRSite.  Only declared in this header;
- /// the definition (and therefore the output format) lives elsewhere.
- NCBI_XALGOSEQ_EXPORT ostream& operator<<(ostream& os, const CRSite& site);
- ///////////////////////////////////////////////////////////
- ///////////////////// inline methods //////////////////////
- ///////////////////////////////////////////////////////////
- /// Record the start and end positions of the recognition sequence.
- inline CRSite::CRSite(int start, int end)
-     : m_Start(start),
-       m_End(end)
- {
- }
- /// Mutable access to the plus-strand cut positions of this site.
- inline vector<int>& CRSite::SetPlusCuts()
- {
-     return m_PlusCuts;
- }
- /// Read-only access to the plus-strand cut positions of this site.
- inline const vector<int>& CRSite::GetPlusCuts() const
- {
-     return m_PlusCuts;
- }
- /// Mutable access to the minus-strand cut positions of this site.
- inline vector<int>& CRSite::SetMinusCuts()
- {
-     return m_MinusCuts;
- }
- /// Read-only access to the minus-strand cut positions of this site.
- inline const vector<int>& CRSite::GetMinusCuts() const
- {
-     return m_MinusCuts;
- }
- /// Overwrite the stored start position of the recognition sequence.
- inline void CRSite::SetStart(int pos)
- {
-     m_Start = pos;
- }
- /// Return the stored start position of the recognition sequence.
- inline int CRSite::GetStart() const
- {
-     return m_Start;
- }
- /// Overwrite the stored end position of the recognition sequence.
- inline void CRSite::SetEnd(int pos)
- {
-     m_End = pos;
- }
- /// Return the stored end position of the recognition sequence.
- inline int CRSite::GetEnd() const
- {
-     return m_End;
- }
- ///
- /// This class represents a restriction enzyme specificity,
- /// i.e., a sequence recognition pattern and vectors of cleavage
- /// sites on the two strands.
- /// Some known enzymes (e.g., BaeI) have two cleavage sites on each
- /// strand. Some will be represented as having zero because the
- /// cut locations are unknown.
- /// An enzyme may have more than one specificity (TaqII).
- ///
- class NCBI_XALGOSEQ_EXPORT CRSpec
- {
- public:
-     // --- recognition sequence ---
-     // Read-only access, mutable access, and assignment from a string.
-     const string& GetSeq() const;
-     string&       SetSeq();
-     void          SetSeq(const string& s);
-     // --- cleavage locations ---
-     // Position 0 is the bond just before the recognition sequence.
-     // Get* hands out a read-only reference to the stored vector,
-     // Set* hands out a mutable reference to the same vector.
-     const vector<int>& GetPlusCuts() const;
-     vector<int>&       SetPlusCuts();
-     const vector<int>& GetMinusCuts() const;
-     vector<int>&       SetMinusCuts();
-     // --- comparison ---
-     // Two specificities are equal only when the recognition sequence
-     // and both cut vectors all compare equal.
-     bool operator==(const CRSpec& rhs) const
-     {
-         if (!(m_Seq == rhs.m_Seq)) {
-             return false;
-         }
-         if (!(m_PlusCuts == rhs.m_PlusCuts)) {
-             return false;
-         }
-         return m_MinusCuts == rhs.m_MinusCuts;
-     }
-     // Negation of operator==.
-     bool operator!=(const CRSpec& rhs) const
-     {
-         return !operator==(rhs);
-     }
-     // Ordering; only declared here, the definition is outside this header.
-     bool operator<(const CRSpec& rhs) const;
-     // Reset everything (declared here, defined outside this header).
-     void Reset();
- private:
-     string      m_Seq;        // recognition sequence
-     vector<int> m_PlusCuts;   // cut sites on the plus strand
-     vector<int> m_MinusCuts;  // cut sites on the minus strand
- };
- ///////////////////////////////////////////////////////////
- ///////////////////// inline methods //////////////////////
- ///////////////////////////////////////////////////////////
- /// Replace the recognition sequence with a copy of `s`.
- inline void CRSpec::SetSeq(const string& s)
- {
-     m_Seq = s;
- }
- /// Mutable access to the stored recognition sequence.
- inline string& CRSpec::SetSeq()
- {
-     return m_Seq;
- }
- /// Read-only access to the stored recognition sequence.
- inline const string& CRSpec::GetSeq() const
- {
-     return m_Seq;
- }
- /// Mutable access to the plus-strand cut positions of this specificity.
- inline vector<int>& CRSpec::SetPlusCuts()
- {
-     return m_PlusCuts;
- }
- /// Read-only access to the plus-strand cut positions of this specificity.
- inline const vector<int>& CRSpec::GetPlusCuts() const
- {
-     return m_PlusCuts;
- }
- /// Mutable access to the minus-strand cut positions of this specificity.
- inline vector<int>& CRSpec::SetMinusCuts()
- {
-     return m_MinusCuts;
- }
- /// Read-only access to the minus-strand cut positions of this specificity.
- inline const vector<int>& CRSpec::GetMinusCuts() const
- {
-     return m_MinusCuts;
- }
- ///
- /// This class represents a restriction enzyme
- /// (an enzyme name and a vector of cleavage specificities)
- ///
- class NCBI_XALGOSEQ_EXPORT CREnzyme
- {
- public:
-     // --- name of the enzyme ---
-     // Read-only access, mutable access, and assignment from a string.
-     const string& GetName() const;
-     string&       SetName();
-     void          SetName(const string& s);
-     // --- cleavage specificities ---
-     // (usually just one, but TaqII has two)
-     const vector<CRSpec>& GetSpecs() const;
-     vector<CRSpec>&       SetSpecs();
-     // Reset everything (declared here, defined outside this header).
-     void Reset();
-     // Given a vector of CREnzyme, lump together all enzymes with
-     // identical specificities.  For two specificities to be considered
-     // identical, the cleavage sites must match in addition to the
-     // recognition sequence.
-     // NOTE(review): `enzymes` is taken by non-const reference, so the
-     // vector is presumably rewritten in place -- confirm in the .cpp.
-     static void CombineIsoschizomers(vector<CREnzyme>& enzymes);
- private:
-     string         m_Name;   // enzyme name
-     vector<CRSpec> m_Specs;  // cleavage specificities of this enzyme
- };
- ///////////////////////////////////////////////////////////
- ///////////////////// inline methods //////////////////////
- ///////////////////////////////////////////////////////////
- /// Replace the enzyme name with a copy of `s`.
- inline void CREnzyme::SetName(const string& s)
- {
-     m_Name = s;
- }
- /// Mutable access to the stored enzyme name.
- inline string& CREnzyme::SetName()
- {
-     return m_Name;
- }
- /// Read-only access to the stored enzyme name.
- inline const string& CREnzyme::GetName() const
- {
-     return m_Name;
- }
- /// Mutable access to the vector of cleavage specificities.
- inline vector<CRSpec>& CREnzyme::SetSpecs()
- {
-     return m_Specs;
- }
- /// Read-only access to the vector of cleavage specificities.
- inline const vector<CRSpec>& CREnzyme::GetSpecs() const
- {
-     return m_Specs;
- }
- ///
- /// This class represents the results of a search for sites
- /// of a particular enzyme.
- /// It merely packages an enzyme name, a vector of
- /// definite sites, and a vector of possible sites
- ///
- class CREnzResult : public CObject
- {
- public:
-     // Construct a result for the named enzyme; both site vectors
-     // start out empty.
-     CREnzResult(const string& enzyme_name)
-         : m_EnzymeName(enzyme_name)
-     {
-     }
-     // Construct a result from an enzyme name and already-collected
-     // definite and possible sites (defined inline after the class).
-     CREnzResult(const string& enzyme_name,
-                 const vector<CRSite>& definite_sites,
-                 const vector<CRSite>& possible_sites);
-     // --- member access ---
-     // The enzyme name is read-only; the site vectors are available
-     // both as read-only (Get*) and mutable (Set*) references.
-     const string& GetEnzymeName() const
-     {
-         return m_EnzymeName;
-     }
-     const vector<CRSite>& GetDefiniteSites() const
-     {
-         return m_DefiniteSites;
-     }
-     vector<CRSite>& SetDefiniteSites()
-     {
-         return m_DefiniteSites;
-     }
-     const vector<CRSite>& GetPossibleSites() const
-     {
-         return m_PossibleSites;
-     }
-     vector<CRSite>& SetPossibleSites()
-     {
-         return m_PossibleSites;
-     }
- private:
-     string         m_EnzymeName;     // name of the enzyme searched for
-     vector<CRSite> m_DefiniteSites;  // definite sites
-     vector<CRSite> m_PossibleSites;  // possible sites
- };
- /// Stream-insertion operator for a CREnzResult.  Only declared in this
- /// header; the definition (and therefore the output format) lives elsewhere.
- NCBI_XALGOSEQ_EXPORT ostream& operator<<(ostream& os, const CREnzResult& er);
- ///////////////////////////////////////////////////////////
- ///////////////////// inline methods //////////////////////
- ///////////////////////////////////////////////////////////
- /// Build a result from an enzyme name plus its definite and possible sites.
- /// Each member is copy-constructed directly in the initializer list (listed
- /// in member declaration order) instead of being default-constructed and
- /// then copy-assigned in the constructor body.
- inline
- CREnzResult::CREnzResult(const string& enzyme_name,
-                          const vector<CRSite>& definite_sites,
-                          const vector<CRSite>& possible_sites)
-     : m_EnzymeName(enzyme_name),
-       m_DefiniteSites(definite_sites),
-       m_PossibleSites(possible_sites)
- {
- }
- /// this class contains the static member functions Find,
- /// which find restriction sites in a sequence
- class NCBI_XALGOSEQ_EXPORT CFindRSites
- {
- public:
-     // Find restriction sites of the given enzymes in a sequence.
-     // The three overloads differ only in the sequence container type
-     // (string, vector<char>, CSeqVector).
-     // NOTE(review): `results` is a non-const reference and is presumably
-     // where the hits are delivered -- confirm against the implementation.
-     static void Find(const string&               seq,
-                      const vector<CREnzyme>&     enzymes,
-                      vector<CRef<CREnzResult> >& results);
-     static void Find(const vector<char>&         seq,
-                      const vector<CREnzyme>&     enzymes,
-                      vector<CRef<CREnzResult> >& results);
-     static void Find(const CSeqVector&           seq,
-                      const vector<CREnzyme>&     enzymes,
-                      vector<CRef<CREnzResult> >& results);
- private:
-     // Internal helpers; only declared here.  All of them are static and
-     // the first two take the CTextFsm<int> by non-const reference.
-     // NOTE(review): judging by the names, these presumably add a
-     // recognition pattern (expanding ambiguous bases) to the FSM --
-     // confirm in the .cpp.
-     static void x_ExpandRecursion(string&        s,
-                                   unsigned int   pos,
-                                   CTextFsm<int>& fsm,
-                                   int            match_value);
-     static void x_AddPattern(const string&  pat,
-                              CTextFsm<int>& fsm,
-                              int            match_value);
-     static bool x_IsAmbig(char nuc);
-     // The free function template x_FindRSite is a friend so that it can
-     // reach the private helpers above.
-     // NOTE(review): presumably a free template rather than a member
-     // template because of the MSVC issue mentioned in revision 1.14.
-     template <class Seq>
-     friend void x_FindRSite(const Seq&                  seq,
-                             const vector<CREnzyme>&     enzymes,
-                             vector<CRef<CREnzResult> >& results);
- };
- END_NCBI_SCOPE
- #endif // ALGO_SEQUENCE___RESTRICTION__HPP
- /*
- * ===========================================================================
- * $Log: restriction.hpp,v $
- * Revision 1000.0 2003/10/29 19:28:55 gouriano
- * PRODUCTION: IMPORTED [ORIGINAL] Dev-tree R1.14
- *
- * Revision 1.14 2003/08/22 14:25:53 ucko
- * Fix for MSVC, which seems to have problems with member templates.
- *
- * Revision 1.13 2003/08/22 02:17:13 ucko
- * Fix WorkShop compilation.
- *
- * Revision 1.12 2003/08/21 19:21:44 jcherry
- * Moved restriction site finding to algo/sequence
- *
- * Revision 1.11 2003/08/21 18:38:31 jcherry
- * Overloaded CFindRSites::Find to take several sequence containers.
- * Added option to lump together enzymes with identical specificities.
- *
- * Revision 1.10 2003/08/20 22:57:44 jcherry
- * Reimplemented restriction site finding using finite state machine
- *
- * Revision 1.9 2003/08/18 19:24:15 jcherry
- * Moved orf and seq_match to algo/sequence
- *
- * Revision 1.8 2003/08/18 13:52:22 jcherry
- * Added operator!= for CRSpec (possible fix for MIPS, which seems to
- * use != in computing vector ==)
- *
- * Revision 1.7 2003/08/17 19:25:30 jcherry
- * Changed member variable names to follow convention
- *
- * Revision 1.6 2003/08/15 16:57:17 jcherry
- * For consecutive enzymes with identical specificities, reuse
- * search results. This saves a bunch of time.
- *
- * Revision 1.5 2003/08/15 15:26:12 jcherry
- * Changed so that restriction site searching (CFindRSites::Find) returns
- * a vector of CRefs rather than a vector of objects. This speeds sorting.
- *
- * Revision 1.4 2003/08/13 17:40:26 dicuccio
- * Formatting fixes. Changes some pass-by-val to pass-by-reference. Fixed
- * complement table
- *
- * Revision 1.3 2003/08/13 16:42:11 dicuccio
- * Compilation fixes for MSVC
- *
- * Revision 1.2 2003/08/13 12:37:58 dicuccio
- * Partial compilation fixes for Windows
- *
- * Revision 1.1 2003/08/12 18:52:58 jcherry
- * Initial version
- *
- * ===========================================================================
- */