生物技术

开发平台：
C/C++

rebase.cpp：源码内容
							/*
 * ===========================================================================
 * PRODUCTION $Log: rebase.cpp,v $
 * PRODUCTION Revision 1000.1  2004/06/01 20:55:41  gouriano
 * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.3
 * PRODUCTION
 * ===========================================================================
 */
/*  $Id: rebase.cpp,v 1000.1 2004/06/01 20:55:41 gouriano Exp $
 * ===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 * Authors:  Josh Cherry
 *
 * File Description:  Code to deal with REBASE restriction enzyme database
 *
 */
#include <ncbi_pch.hpp>
#include <corelib/ncbistd.hpp>
#include <algo/sequence/restriction.hpp>
#include "rebase.hpp"
BEGIN_NCBI_SCOPE
CRSpec CRebase::MakeRSpec(const string& site)
{
    // site contains a string such as
    // GACGTC, GACGT^C, GCAGC(8/12), or (8/13)GAGNNNNNCTC(13/8)
    
    CRSpec spec;
    int plus_cut, minus_cut;
    string s = site;
    if (s[0] == '(') {
        string::size_type idx = s.find_first_of(")");
        if (idx == std::string::npos) {
            throw runtime_error(string("Error parsing site ")
                                + site);
        }
        x_ParseCutPair(s.substr(0, idx + 1), plus_cut, minus_cut);
        s.erase(0, idx + 1);
        spec.SetPlusCuts().push_back(-plus_cut);
        spec.SetMinusCuts().push_back(-minus_cut);
    }
    if (s[s.length() - 1] == ')') {
        string::size_type idx = s.find_last_of("(");
        if (idx == std::string::npos) {
            throw runtime_error(string("Error parsing site ")
                                + site);
        }
        x_ParseCutPair(s.substr(idx), plus_cut, minus_cut);
        s.erase(idx);
        spec.SetPlusCuts().push_back(plus_cut + s.length());
        spec.SetMinusCuts().push_back(minus_cut + s.length());
    }
    for (unsigned int i = 0;  i < s.length();  i++) {
        if (s[i] == '^') {
            // This should be simple.  If we have
            // G^GATCC, the cut on the plus strand is
            // at 1, and on the minus it's at the
            // symmetric position, 5.
            // The complication is that TspRI
            // is given as CASTGNN^, not NNCASTGNN^.
            // Someone someday might write NNCASTGNN^,
            // and something like ^NNGATC might arise,
            // so code defensively.
            
            s.erase(i, 1);
            int plus_cut = i;
            // this is the slightly complicated bit
            // trim any leading and trailing N's;
            // in case leading N's removed, adjust plus_cut accordingly
            string::size_type idx = s.find_first_not_of("N");
            if (idx == string::npos) {
                // bizarre situation; all N's (possibly empty)
                s.erase();
                plus_cut = 0;
            } else {
                s.erase(0, idx);
                plus_cut -= idx;
            }
            idx = s.find_last_not_of("N");
            s.erase(idx + 1);
            // plus strand cut
            spec.SetPlusCuts().push_back(plus_cut);
            // symmetric cut on minus strand
            spec.SetMinusCuts().push_back(s.length() - plus_cut);
            break;  // there better be just one '^'
        }
    }
    spec.SetSeq(s);
    return spec;
}
CREnzyme CRebase::MakeREnzyme(const string& name, const string& sites)
{
    CREnzyme re;
    re.SetName(name);
    
    vector<string> site_vec;
    NStr::Tokenize(sites, ",", site_vec);
    ITERATE (vector<string>, iter, site_vec) {
        CRSpec spec = CRebase::MakeRSpec(*iter);
        re.SetSpecs().push_back(spec);
    }
    
    return re;
}
void CRebase::x_ParseCutPair(const string& s, int& plus_cut, int& minus_cut)
{
    // s should look like "(8/13)"; plus_cut gets set to 8 and minus_cut to 13
    list<string> l;
    NStr::Split(s.substr(1, s.length() - 2), "/", l);
    if (l.size() != 2) {
        throw runtime_error(string("Couldn't parse cut locations ")
                            + s);
    }
    plus_cut = NStr::StringToInt(l.front());
    minus_cut = NStr::StringToInt(l.back());
}
void CRebase::ReadNARFormat(istream& input,
                            vector<CREnzyme>& enzymes,
                            enum EEnzymesToLoad which)
{
    string line;
    CREnzyme enzyme;
    string name;
    while (getline(input, line)) {
        vector<string> fields;
        NStr::Tokenize(line, "t", fields);
        // the lines we're interested in have more than one field
        if (fields.size() < 2) {
            continue;
        }
        // name of enzyme is in field one or two (field zero is empty)
        name = fields[1];
        if (name == " ") {
            if (which == ePrototype) {
                continue;  // skip this non-protype
            }
            name = fields[2];
        }
        if (which == eCommercial && fields[5].empty()) {
            continue;  // skip this commercially unavailable enzyme
        }
        string sites = fields[3];  // this contains a comma-separted list of sites,
                            // usually just one (but TaqII has two)
        enzyme = MakeREnzyme(name, sites);
        enzymes.push_back(enzyme);
    }
}
END_NCBI_SCOPE
/*
 * ===========================================================================
 * $Log: rebase.cpp,v $
 * Revision 1000.1  2004/06/01 20:55:41  gouriano
 * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.3
 *
 * Revision 1.3  2004/05/21 22:27:47  gorelenk
 * Added PCH ncbi_pch.hpp
 *
 * Revision 1.2  2003/08/21 19:24:16  jcherry
 * Moved restriction site finding to algo/sequence
 *
 * Revision 1.1  2003/08/12 18:52:58  jcherry
 * Initial version
 *
 * ===========================================================================
 */