生物技术

开发平台：
C/C++

restriction_sites.cpp：源码内容
							/*
 * ===========================================================================
 * PRODUCTION $Log: restriction_sites.cpp,v $
 * PRODUCTION Revision 1000.5  2004/06/01 20:55:43  gouriano
 * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.32
 * PRODUCTION
 * ===========================================================================
 */
/*  $Id: restriction_sites.cpp,v 1000.5 2004/06/01 20:55:43 gouriano Exp $
 * ===========================================================================
 *
 *                            PUBLIC DOMAIN NOTICE
 *               National Center for Biotechnology Information
 *
 *  This software/database is a "United States Government Work" under the
 *  terms of the United States Copyright Act.  It was written as part of
 *  the author's official duties as a United States Government employee and
 *  thus cannot be copyrighted.  This software/database is freely available
 *  to the public for use. The National Library of Medicine and the U.S.
 *  Government have not placed any restriction on its use or reproduction.
 *
 *  Although all reasonable efforts have been taken to ensure the accuracy
 *  and reliability of the software and data, the NLM and the U.S.
 *  Government do not and cannot warrant the performance or results that
 *  may be obtained by using this software or data. The NLM and the U.S.
 *  Government disclaim all warranties, express or implied, including
 *  warranties of performance, merchantability or fitness for any particular
 *  purpose.
 *
 *  Please cite the author in any work or product based on this material.
 *
 * ===========================================================================
 *
 * Authors:  Josh Cherry
 *
 * File Description:  gbench plugin for finding restriction sites
 *
 */
#include <ncbi_pch.hpp>
#include "restriction_sites.hpp"
#include "rebase.hpp"
#include <algo/sequence/restriction.hpp>
#include <algo/sequence/seq_match.hpp>
#include <algorithm>
#include <corelib/ncbiapp.hpp>
#include <corelib/ncbireg.hpp>
#include <gui/core/plugin_utils.hpp>
#include <gui/utils/system_path.hpp>
#include <gui/core/version.hpp>
#include <gui/dialogs/col/multi_col_dlg.hpp>
#include <gui/plugin/PluginCommandSet.hpp>
#include <gui/plugin/PluginInfo.hpp>
#include <gui/plugin/PluginRequest.hpp>
#include <gui/plugin/PluginValueConstraint.hpp>
#include <gui/utils/message_box.hpp>
#include <gui/objutils/utils.hpp>
#include <objects/general/Dbtag.hpp>
#include <objects/general/Object_id.hpp>
#include <objects/seqfeat/Rsite_ref.hpp>
#include <objects/seqloc/Seq_point.hpp>
#include <objmgr/seq_vector.hpp>
#include <objmgr/util/sequence.hpp>
BEGIN_NCBI_SCOPE
CAlgoPlugin_RestrictionSites::~CAlgoPlugin_RestrictionSites()
{
}
// standard plugin announce bopilerplate
void CAlgoPlugin_RestrictionSites::GetInfo(CPluginInfo& info)
{
    info.Reset();
    
    // version info macro
    info.SetInfo(CPluginVersion::eMajor, CPluginVersion::eMinor, 0,
                 string(__DATE__) + " " + string(__TIME__),
                 "CAlgoPlugin_RestrictionSites", "Search/Find restriction sites",
                 "Search a DNA sequence for restriction sites",
                 "");
    // command info
    CPluginCommandSet& cmds = info.SetCommands();
    CPluginCommand&    args = cmds.AddAlgoCommand(eAlgoCommand_run);
    args.AddArgument("locs", "Locations to evaluate",
                     CSeq_loc::GetTypeInfo(),
                     CPluginArg::TData::e_Array);
    args.SetConstraint("locs",
                       (*CPluginValueConstraint::CreateSeqMol(),
                        CSeq_inst::eMol_na,
                        CSeq_inst::eMol_dna,
                        CSeq_inst::eMol_rna));
    args.AddDefaultArgument("which_enzymes", "Which restriction enzymes",
                            CPluginArg::eString, "commercially available");
    args.SetConstraint("which_enzymes", (*CPluginValueConstraint::CreateSet(),
                                "commercially available",
                                "prototypes", "all"));
    args.AddDefaultArgument("combine_isoschizomers",
                            "Combine isoschizomers",
                            CPluginArg::eBoolean, "true");
    args.AddArgument("sort_by_number_of_sites",
                     "Sort results by number of (definite) cuts by enzyme",
                     CPluginArg::eBoolean);
}
// helper functor for sorting CRef<CREnzResult> by the enzyme name
struct SEnzymeNameCompare
{
    bool operator()
        (const CRef<CREnzResult>& lhs, const CRef<CREnzResult>& rhs) const
    {
        return lhs->GetEnzymeName() < rhs->GetEnzymeName();
    }
};
// helper functor for sorting CREnzyme by the enzyme name
struct SNameCompare
{
    bool operator()
        (const CREnzyme& lhs, const CREnzyme& rhs) const
    {
        return lhs.GetName() < rhs.GetName();
    }
};
// helper functor for sorting by the number of definite sites
struct SLessDefSites
{
    bool operator() 
        (const CRef<CREnzResult>& lhs, const CRef<CREnzResult>& rhs) const
    {
        return lhs->GetDefiniteSites().size() < rhs->GetDefiniteSites().size();
    }
};
// helper functor for sorting CRef<CSeq_loc>s by location
struct SLessSeq_loc
{
    bool operator() 
        (const CRef<CSeq_loc>& lhs, const CRef<CSeq_loc>& rhs) const
    {
        return (lhs->Compare(*rhs) < 0);
    }
};
static
void s_AddSitesToAnnot(const vector<CRSite>& sites,
                       const CREnzResult&    result,
                       CSeq_annot&           annot,
                       const CSeq_loc&       parent_loc,
                       bool                  definite = true)
{
    const CSeq_id& id = sequence::GetId(parent_loc);
    ITERATE (vector<CRSite>, site, sites) {
        // create feature
        CRef<CSeq_feat> feat(new CSeq_feat());
        // start to set up Rsite
        feat->SetData().SetRsite().SetDb().SetDb("REBASE");
        feat->SetData().SetRsite().SetDb()
            .SetTag().SetStr("REBASE");
        string str(result.GetEnzymeName());
        if ( !definite ) {
            str = "Possible " + str;
        }
        feat->SetData().SetRsite().SetStr(str);
        //
        // build our location
        //
        vector< CRef<CSeq_loc> > locs;
        // a loc for the recognition site
        CRef<CSeq_loc> recog_site(new CSeq_loc);
        recog_site->SetInt().SetFrom(site->GetStart());
        recog_site->SetInt().SetTo  (site->GetEnd());
        recog_site->SetId(id);
        locs.push_back(recog_site);
        // locs for the cleavage sites
        int negative_cut_locs = 0;  // count these exceptions
        ITERATE (vector<int>, cut, site->GetPlusCuts()) {
            if (*cut >= 0 ) {
                CRef<CSeq_loc> cut_site(new CSeq_loc);
                cut_site->SetPnt().SetPoint(*cut);
                // indicate that the cut is to the "left"
                cut_site->SetPnt()
                    .SetFuzz().SetLim(CInt_fuzz::eLim_tl);
                cut_site->SetPnt().SetStrand(eNa_strand_plus);
                cut_site->SetId(id);
                locs.push_back(cut_site);
            } else {
                negative_cut_locs++;
            }
        }
        ITERATE (vector<int>, cut, site->GetMinusCuts()) {
            if (*cut >= 0 ) {
                CRef<CSeq_loc> cut_site(new CSeq_loc);
                cut_site->SetPnt().SetPoint(*cut);
                // indicate that the cut is to the "left"
                cut_site->SetPnt()
                    .SetFuzz().SetLim(CInt_fuzz::eLim_tl);
                cut_site->SetPnt().SetStrand(eNa_strand_minus);
                cut_site->SetId(id);
                locs.push_back(cut_site);
            } else {
                negative_cut_locs++;
            }
        }
        // comment for those few cases where there are
        // cuts before the sequence begins
        if (negative_cut_locs > 0) {
            string a_comm = NStr::IntToString(negative_cut_locs)
                + " cleavage sites are located before the"
                " beginning of the sequence and are not reported";
            feat->SetComment(a_comm);
        }
        sort(locs.begin(), locs.end(), SLessSeq_loc());
        copy(locs.begin(), locs.end(),
             back_inserter(feat->SetLocation().SetMix().Set()));
        feat->SetLocation
            (*CSeqUtils::RemapChildToParent(parent_loc, feat->GetLocation()));
        // save in annot
        annot.SetData().SetFtable().push_back(feat);
    }
}
void CAlgoPlugin_RestrictionSites::RunCommand(CPluginMessage& msg)
{
    const CPluginCommand& args = msg.GetRequest().GetCommand();
    CPluginReply& reply = msg.SetReply();
    _TRACE("CAlgoPlugin_RestrictionSites::RunCommand()");
    
    // load patterns from file
    CRebase::EEnzymesToLoad which_enzymes 
        = x_DecodeWhichEnzymes(args["which_enzymes"].AsString());
    vector<CREnzyme> enzymes;
    try {
        x_LoadREnzymeData(enzymes, which_enzymes);
    }
    catch (const exception& e) {
        NcbiMessageBox(e.what());
        reply.SetStatus(eMessageStatus_failed);
        return;
    }
    // optionally lump together all enzymes with identical specificities
    if (args["combine_isoschizomers"].AsBoolean()) {
        // first sort alphabetically by enzyme name
        sort(enzymes.begin(), enzymes.end(), SNameCompare());
        // now combine isoschizomers
        CREnzyme::CombineIsoschizomers(enzymes);
    }
    if ( !m_Dialog.get() ) {
        m_Dialog.reset(new CMultiColDlg());
        m_Dialog->SetWindowSize(1000, 500);
        m_Dialog->SetTitle("Restriction Sites");
        
        m_Dialog->SetColumn(0, "Sequence", FL_ALIGN_LEFT, 0.3f);
        m_Dialog->SetColumn(1, "Location", FL_ALIGN_LEFT, 0.5f);
        m_Dialog->SetColumn(2, "Enzyme", FL_ALIGN_LEFT, 0.4f);
        m_Dialog->SetColumn(3, "Number of Sites", FL_ALIGN_CENTER, 0.5f);
        m_Dialog->SetColumn(4, "Recog. Site Loc.", FL_ALIGN_CENTER, 0.75f);
        m_Dialog->SetColumn(5, "Plus Strand Cuts", FL_ALIGN_CENTER, 0.75f);
        m_Dialog->SetColumn(6, "Minus Strand Cuts", FL_ALIGN_CENTER, 0.75f);
    }
    m_Dialog->SetRows(0);  // to clear any previous contents
    int row = 0;
    plugin_args::TLocList locs;
    GetArgValue(args["locs"], locs);
    ITERATE (plugin_args::TLocList, iter, locs) {
        const CSeq_loc&  loc = *iter->second;
        const IDocument& doc = *iter->first;
        // find the best ID for this bioseq
        try {
            CBioseq_Handle handle = doc.GetScope().GetBioseqHandle(loc);
            // get sequence in binary (8na) form
            CSeqVector vec =
                handle.GetSequenceView(loc,
                                       CBioseq_Handle::eViewConstructed,
                                       CBioseq_Handle::eCoding_Ncbi);
            string& id_str  = m_Dialog->SetCell(row, 0);
            string& loc_str = m_Dialog->SetCell(row, 1);
            const CSeq_id& best_id =
                sequence::GetId(handle, sequence::eGetId_Best);
            id_str.erase();
            best_id.GetLabel(&id_str);
            loc_str = CPluginUtils::GetLabel(loc, &doc.GetScope());
            // a new feature table
            CRef<CSeq_annot> annot(new CSeq_annot());
            // a place to store results (one per enzyme)
            typedef vector<CRef<CREnzResult> > TResults;
            TResults results;
            // do the big search
            CFindRSites::Find(vec, enzymes, results);
            // add description to annot
            annot->SetName("Restriction sites");
            // deal with stored enzyme results
            sort(results.begin(), results.end(), SEnzymeNameCompare());
            if (args["sort_by_number_of_sites"].AsBoolean()) {
                // do a stablesort so alphabetical order preserved
                // within sets with the same number of sites
                stable_sort(results.begin(), results.end(), SLessDefSites());
            }
            // count the number of definite and possible sites
            int total_definite_sites = 0, total_possible_sites = 0;
            int total_non_cutters = 0;
            ITERATE (TResults, result, results) {
                total_definite_sites += (*result)->GetDefiniteSites().size();
                total_possible_sites += (*result)->GetPossibleSites().size();
                if ((*result)->GetDefiniteSites().size()
                    + (*result)->GetPossibleSites().size() == 0) {
                    total_non_cutters++;
                }
            }
            _TRACE("Found " << total_definite_sites << " definite and "
                   << total_possible_sites << " possible sites");
            // in order to build dialog efficiently,
            // pre-allocate rows
            int total_rows = total_definite_sites + total_possible_sites
                + total_non_cutters;
            m_Dialog->SetRows(row + total_rows);
            ITERATE (TResults, result, results) {
                
                //
                // add sites to dialog
                //
                int starting_row = row;
                m_Dialog->SetCell(row, 2) = (*result)->GetEnzymeName();
                string num_sites =
                    NStr::IntToString((*result)->GetDefiniteSites().size());
                if (!(*result)->GetPossibleSites().empty()) {
                    num_sites += "+";
                    num_sites +=
                        NStr::IntToString((*result)
                                          ->GetPossibleSites().size());
                    num_sites += "?";
                }
                m_Dialog->SetCell(row, 3) = num_sites;
                const vector<CRSite>& definite_sites
                    = (*result)->GetDefiniteSites();
                ITERATE (vector<CRSite>, site, definite_sites) {
                    string& pos_str = m_Dialog->SetCell(row, 4);
                    pos_str = NStr::IntToString(site->GetStart() + 1) + " - "
                        + NStr::IntToString(site->GetEnd() + 1);
                    string plus_cuts;
                    const vector<int>& pcuts = site->GetPlusCuts();
                    ITERATE (vector<int>, cut, pcuts) {
                        if (!plus_cuts.empty()) {
                            plus_cuts += ", ";
                        }
                        plus_cuts += NStr::IntToString(*cut);
                    }
                    m_Dialog->SetCell(row, 5) = plus_cuts;
                    string minus_cuts;
                    const vector<int>& mcuts = site->GetMinusCuts();
                    ITERATE (vector<int>, cut, mcuts) {
                        if (!minus_cuts.empty()) {
                            minus_cuts += ", ";
                        }
                        minus_cuts += NStr::IntToString(*cut);
                    }
                    m_Dialog->SetCell(row, 6) = minus_cuts;
                    ++row;
                }
                const vector<CRSite>& possible_sites
                    = (*result)->GetPossibleSites();
                ITERATE (vector<CRSite>, site, possible_sites) {
                    string& pos_str = m_Dialog->SetCell(row, 4);
                    pos_str = string("???")
                        + NStr::IntToString(site->GetStart() + 1) + " - "
                        + NStr::IntToString(site->GetEnd() + 1);
                    string plus_cuts;
                    const vector<int>& pcuts = site->GetPlusCuts();
                    ITERATE (vector<int>, cut, pcuts) {
                        if (!plus_cuts.empty()) {
                            plus_cuts += ", ";
                        }
                        plus_cuts += NStr::IntToString(*cut);
                    }
                    m_Dialog->SetCell(row, 5) = plus_cuts;
                    string minus_cuts;
                    const vector<int>& mcuts = site->GetMinusCuts();
                    ITERATE (vector<int>, cut, mcuts) {
                        if (!minus_cuts.empty()) {
                            minus_cuts += ", ";
                        }
                        minus_cuts += NStr::IntToString(*cut);
                    }
                    m_Dialog->SetCell(row, 6) = minus_cuts;
                    ++row;
                }
                if (row == starting_row) {
                    row++;
                }
                //
                // add features to annot
                //
                s_AddSitesToAnnot(definite_sites, **result, *annot, loc);
                s_AddSitesToAnnot(possible_sites, **result, *annot, loc, false);
            }
            // attach annot to doc
            //const_cast<IDocument&>(doc).AttachAnnot(*annot);
            //const_cast<IDocument&>(doc).UpdateAllViews(fDocumentChanged);
            reply.AddObject(doc, *annot);
        }
        catch (exception& e) {
            LOG_POST(Error << e.what());
            string str = CPluginUtils::GetLabel(loc, &doc.GetScope());
            LOG_POST(Error << "Error processing location " << str);
        }
#ifndef _DEBUG
        catch (...) {
            string str = CPluginUtils::GetLabel(loc, &doc.GetScope());
            LOG_POST(Error << "Error processing location " << str);
        }
#endif
    }
    // update all views
    //CDocManager::UpdateAllViews();
    //
    // prepare our dialog box
    //
    m_Dialog->SetLabel(string("A search for restriction sites for ")
                       + NStr::IntToString(enzymes.size())
                       + " enzymes produced:");
    m_Dialog->Show();
    reply.SetStatus(eMessageStatus_success);
    reply.AddAction(CPluginReplyAction::e_Add_to_document);
}
void 
CAlgoPlugin_RestrictionSites::x_LoadREnzymeData(vector<CREnzyme>& enzymes,
                                     CRebase::EEnzymesToLoad which_enzymes)
{
    CNcbiApplication* app = CNcbiApplication::Instance();
    _ASSERT(app);
    CNcbiRegistry& registry = app->GetConfig();
    string fname;
    // By default the rebase "NAR format" file 
    // is assumed to be <std>/etc/rebase.nar.
    // This can be overridden in gbench.ini, via the application registry
    // variable [RESTRICTION_SITES] RebaseData.
    fname = registry.GetString("RESTRICTION_SITES", "RebaseData", "");
    if ( !fname.empty() ) {
        fname += ", ";
    }
    fname += "<home>/etc/rebase.nar, <std>/etc/rebase.nar";
    fname = CSystemPath::ResolvePathExisting(fname);
    if ( fname.empty() ) {
        throw runtime_error("Couldn't open REBASE file ");
    }
    ifstream refile(fname.c_str());
    CRebase::ReadNARFormat(refile, enzymes, which_enzymes);
}
// helper function to decode a parameter
CRebase::EEnzymesToLoad
CAlgoPlugin_RestrictionSites::x_DecodeWhichEnzymes(const string& s)
{
    if (s == "commercially available") {
        return CRebase::eCommercial;
    } else if (s == "prototypes") {
        return CRebase::ePrototype;
    } else if (s == "all") {
        return CRebase::eAll;
    } else {
        throw runtime_error(string("Invalid "which_enzymes" string: ")
                            + s);
    }
}
END_NCBI_SCOPE
/*
 * ===========================================================================
 * $Log: restriction_sites.cpp,v $
 * Revision 1000.5  2004/06/01 20:55:43  gouriano
 * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.32
 *
 * Revision 1.32  2004/05/21 22:27:47  gorelenk
 * Added PCH ncbi_pch.hpp
 *
 * Revision 1.31  2004/05/03 13:05:42  dicuccio
 * gui/utils --> gui/objutils where needed
 *
 * Revision 1.30  2004/03/05 17:35:37  dicuccio
 * Use sequence::GetId() instead of CSeq_id::GetStringDescr()
 *
 * Revision 1.29  2004/02/17 20:35:25  rsmith
 * moved core/settings.[ch]pp and core/system_path.[ch]pp to config and utils, respectively.
 *
 * Revision 1.28  2004/01/27 18:38:09  dicuccio
 * Code clean-up.  Use standard names for plugins.  Removed unnecessary #includes
 *
 * Revision 1.27  2004/01/07 15:50:39  dicuccio
 * Adjusted for API change in CPluginUtils::GetLabel().  Standardized exception
 * reporting in algorithms.
 *
 * Revision 1.26  2003/12/31 15:36:08  grichenk
 * Moved CompareLocations() from CSeq_feat to CSeq_loc,
 * renamed it to Compare().
 *
 * Revision 1.25  2003/11/24 15:45:28  dicuccio
 * Renamed CVersion to CPluginVersion
 *
 * Revision 1.24  2003/11/18 17:48:38  dicuccio
 * Added standard processing of return values
 *
 * Revision 1.23  2003/11/04 17:49:23  dicuccio
 * Changed calling parameters for plugins - pass CPluginMessage instead of paired
 * CPluginCommand/CPluginReply
 *
 * Revision 1.22  2003/10/27 17:46:49  dicuccio
 * Removed dead #includes
 *
 * Revision 1.21  2003/10/17 19:10:42  dicuccio
 * Code clean-up.  Refactored feature building into a more modular function.
 * Fixed list sorting - replaced with sorting of a vector to work around lack of
 * support for list::sort(functor) in MSVC
 *
 * Revision 1.20  2003/10/17 18:07:47  jcherry
 * Sort Seq-locs in mix by location.  Remap features for possible sites too.
 *
 * Revision 1.19  2003/10/15 13:40:26  dicuccio
 * Mkae sure to set the 'id' for the seq-locs before calling RemapChildToParent()
 *
 * Revision 1.18  2003/10/14 16:24:02  dicuccio
 * Added correct remapping of scanned locations to the parent location.  Cleaned
 * up code to look for data file - added hierarchichal search through path in INI,
 * user's home directory, and finally system installed path.
 *
 * Revision 1.17  2003/10/07 13:47:00  dicuccio
 * Renamed CPluginURL* to CPluginValue*
 *
 * Revision 1.16  2003/09/25 17:21:35  jcherry
 * Added name to annot
 *
 * Revision 1.15  2003/09/04 14:05:24  dicuccio
 * Use IDocument instead of CDocument
 *
 * Revision 1.14  2003/09/03 14:46:53  rsmith
 * change namespace name from args to plugin_args to avoid clashes with variable names.
 *
 * Revision 1.13  2003/08/22 15:47:45  dicuccio
 * Added 'objects::' where necessary
 *
 * Revision 1.12  2003/08/21 19:24:16  jcherry
 * Moved restriction site finding to algo/sequence
 *
 * Revision 1.11  2003/08/21 18:38:32  jcherry
 * Overloaded CFindRSites::Find to take several sequence containers.
 * Added option to lump together enzymes with identical specificities.
 *
 * Revision 1.10  2003/08/21 12:03:07  dicuccio
 * Make use of new typedef in plugin_utils.hpp for argument values.
 *
 * Revision 1.9  2003/08/20 22:57:44  jcherry
 * Reimplemented restriction site finding using finite state machine
 *
 * Revision 1.8  2003/08/18 19:24:15  jcherry
 * Moved orf and seq_match to algo/sequence
 *
 * Revision 1.7  2003/08/15 16:57:17  jcherry
 * For consecutive enzymes with identical specificities, reuse
 * search results.  This saves a bunch of time.
 *
 * Revision 1.6  2003/08/15 16:33:40  jcherry
 * Fixed typo
 *
 * Revision 1.5  2003/08/15 15:26:12  jcherry
 * Changed so that restriction site searching (CFindRSites::Find) returns
 * a vector of CRefs rather than a vector of objects.  This speeds sorting.
 *
 * Revision 1.4  2003/08/14 18:33:32  jcherry
 * Use an rsite feature type (rather than a region).
 * Also include cleavage sites in feature location.
 *
 * Revision 1.3  2003/08/13 17:40:26  dicuccio
 * Formatting fixes.  Changes some pass-by-val to pass-by-reference.  Fixed
 * complement table
 *
 * Revision 1.2  2003/08/13 12:37:58  dicuccio
 * Partial compilation fixes for Windows
 *
 * Revision 1.1  2003/08/12 18:52:58  jcherry
 * Initial version
 *
 * ===========================================================================
 */