rebase.cpp
上传用户:yhdzpy8989
上传日期:2007-06-13
资源大小:13604k
文件大小:7k
- /*
- * ===========================================================================
- * PRODUCTION $Log: rebase.cpp,v $
- * PRODUCTION Revision 1000.1 2004/06/01 20:55:41 gouriano
- * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.3
- * PRODUCTION
- * ===========================================================================
- */
- /* $Id: rebase.cpp,v 1000.1 2004/06/01 20:55:41 gouriano Exp $
- * ===========================================================================
- *
- * PUBLIC DOMAIN NOTICE
- * National Center for Biotechnology Information
- *
- * This software/database is a "United States Government Work" under the
- * terms of the United States Copyright Act. It was written as part of
- * the author's official duties as a United States Government employee and
- * thus cannot be copyrighted. This software/database is freely available
- * to the public for use. The National Library of Medicine and the U.S.
- * Government have not placed any restriction on its use or reproduction.
- *
- * Although all reasonable efforts have been taken to ensure the accuracy
- * and reliability of the software and data, the NLM and the U.S.
- * Government do not and cannot warrant the performance or results that
- * may be obtained by using this software or data. The NLM and the U.S.
- * Government disclaim all warranties, express or implied, including
- * warranties of performance, merchantability or fitness for any particular
- * purpose.
- *
- * Please cite the author in any work or product based on this material.
- *
- * ===========================================================================
- *
- * Authors: Josh Cherry
- *
- * File Description: Code to deal with REBASE restriction enzyme database
- *
- */
- #include <ncbi_pch.hpp>
- #include <corelib/ncbistd.hpp>
- #include <algo/sequence/restriction.hpp>
- #include "rebase.hpp"
- BEGIN_NCBI_SCOPE
- CRSpec CRebase::MakeRSpec(const string& site)
- {
- // site contains a string such as
- // GACGTC, GACGT^C, GCAGC(8/12), or (8/13)GAGNNNNNCTC(13/8)
-
- CRSpec spec;
- int plus_cut, minus_cut;
- string s = site;
- if (s[0] == '(') {
- string::size_type idx = s.find_first_of(")");
- if (idx == std::string::npos) {
- throw runtime_error(string("Error parsing site ")
- + site);
- }
- x_ParseCutPair(s.substr(0, idx + 1), plus_cut, minus_cut);
- s.erase(0, idx + 1);
- spec.SetPlusCuts().push_back(-plus_cut);
- spec.SetMinusCuts().push_back(-minus_cut);
- }
- if (s[s.length() - 1] == ')') {
- string::size_type idx = s.find_last_of("(");
- if (idx == std::string::npos) {
- throw runtime_error(string("Error parsing site ")
- + site);
- }
- x_ParseCutPair(s.substr(idx), plus_cut, minus_cut);
- s.erase(idx);
- spec.SetPlusCuts().push_back(plus_cut + s.length());
- spec.SetMinusCuts().push_back(minus_cut + s.length());
- }
- for (unsigned int i = 0; i < s.length(); i++) {
- if (s[i] == '^') {
- // This should be simple. If we have
- // G^GATCC, the cut on the plus strand is
- // at 1, and on the minus it's at the
- // symmetric position, 5.
- // The complication is that TspRI
- // is given as CASTGNN^, not NNCASTGNN^.
- // Someone someday might write NNCASTGNN^,
- // and something like ^NNGATC might arise,
- // so code defensively.
-
- s.erase(i, 1);
- int plus_cut = i;
- // this is the slightly complicated bit
- // trim any leading and trailing N's;
- // in case leading N's removed, adjust plus_cut accordingly
- string::size_type idx = s.find_first_not_of("N");
- if (idx == string::npos) {
- // bizarre situation; all N's (possibly empty)
- s.erase();
- plus_cut = 0;
- } else {
- s.erase(0, idx);
- plus_cut -= idx;
- }
- idx = s.find_last_not_of("N");
- s.erase(idx + 1);
- // plus strand cut
- spec.SetPlusCuts().push_back(plus_cut);
- // symmetric cut on minus strand
- spec.SetMinusCuts().push_back(s.length() - plus_cut);
- break; // there better be just one '^'
- }
- }
- spec.SetSeq(s);
- return spec;
- }
- CREnzyme CRebase::MakeREnzyme(const string& name, const string& sites)
- {
- CREnzyme re;
- re.SetName(name);
-
- vector<string> site_vec;
- NStr::Tokenize(sites, ",", site_vec);
- ITERATE (vector<string>, iter, site_vec) {
- CRSpec spec = CRebase::MakeRSpec(*iter);
- re.SetSpecs().push_back(spec);
- }
-
- return re;
- }
- void CRebase::x_ParseCutPair(const string& s, int& plus_cut, int& minus_cut)
- {
- // s should look like "(8/13)"; plus_cut gets set to 8 and minus_cut to 13
- list<string> l;
- NStr::Split(s.substr(1, s.length() - 2), "/", l);
- if (l.size() != 2) {
- throw runtime_error(string("Couldn't parse cut locations ")
- + s);
- }
- plus_cut = NStr::StringToInt(l.front());
- minus_cut = NStr::StringToInt(l.back());
- }
- void CRebase::ReadNARFormat(istream& input,
- vector<CREnzyme>& enzymes,
- enum EEnzymesToLoad which)
- {
- string line;
- CREnzyme enzyme;
- string name;
- while (getline(input, line)) {
- vector<string> fields;
- NStr::Tokenize(line, "t", fields);
- // the lines we're interested in have more than one field
- if (fields.size() < 2) {
- continue;
- }
- // name of enzyme is in field one or two (field zero is empty)
- name = fields[1];
- if (name == " ") {
- if (which == ePrototype) {
- continue; // skip this non-protype
- }
- name = fields[2];
- }
- if (which == eCommercial && fields[5].empty()) {
- continue; // skip this commercially unavailable enzyme
- }
- string sites = fields[3]; // this contains a comma-separted list of sites,
- // usually just one (but TaqII has two)
- enzyme = MakeREnzyme(name, sites);
- enzymes.push_back(enzyme);
- }
- }
- END_NCBI_SCOPE
- /*
- * ===========================================================================
- * $Log: rebase.cpp,v $
- * Revision 1000.1 2004/06/01 20:55:41 gouriano
- * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.3
- *
- * Revision 1.3 2004/05/21 22:27:47 gorelenk
- * Added PCH ncbi_pch.hpp
- *
- * Revision 1.2 2003/08/21 19:24:16 jcherry
- * Moved restriction site finding to algo/sequence
- *
- * Revision 1.1 2003/08/12 18:52:58 jcherry
- * Initial version
- *
- * ===========================================================================
- */