restriction_sites.cpp
上传用户:yhdzpy8989
上传日期:2007-06-13
资源大小:13604k
文件大小:23k
- /*
- * ===========================================================================
- * PRODUCTION $Log: restriction_sites.cpp,v $
- * PRODUCTION Revision 1000.5 2004/06/01 20:55:43 gouriano
- * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.32
- * PRODUCTION
- * ===========================================================================
- */
- /* $Id: restriction_sites.cpp,v 1000.5 2004/06/01 20:55:43 gouriano Exp $
- * ===========================================================================
- *
- * PUBLIC DOMAIN NOTICE
- * National Center for Biotechnology Information
- *
- * This software/database is a "United States Government Work" under the
- * terms of the United States Copyright Act. It was written as part of
- * the author's official duties as a United States Government employee and
- * thus cannot be copyrighted. This software/database is freely available
- * to the public for use. The National Library of Medicine and the U.S.
- * Government have not placed any restriction on its use or reproduction.
- *
- * Although all reasonable efforts have been taken to ensure the accuracy
- * and reliability of the software and data, the NLM and the U.S.
- * Government do not and cannot warrant the performance or results that
- * may be obtained by using this software or data. The NLM and the U.S.
- * Government disclaim all warranties, express or implied, including
- * warranties of performance, merchantability or fitness for any particular
- * purpose.
- *
- * Please cite the author in any work or product based on this material.
- *
- * ===========================================================================
- *
- * Authors: Josh Cherry
- *
- * File Description: gbench plugin for finding restriction sites
- *
- */
- #include <ncbi_pch.hpp>
- #include "restriction_sites.hpp"
- #include "rebase.hpp"
- #include <algo/sequence/restriction.hpp>
- #include <algo/sequence/seq_match.hpp>
- #include <algorithm>
- #include <corelib/ncbiapp.hpp>
- #include <corelib/ncbireg.hpp>
- #include <gui/core/plugin_utils.hpp>
- #include <gui/utils/system_path.hpp>
- #include <gui/core/version.hpp>
- #include <gui/dialogs/col/multi_col_dlg.hpp>
- #include <gui/plugin/PluginCommandSet.hpp>
- #include <gui/plugin/PluginInfo.hpp>
- #include <gui/plugin/PluginRequest.hpp>
- #include <gui/plugin/PluginValueConstraint.hpp>
- #include <gui/utils/message_box.hpp>
- #include <gui/objutils/utils.hpp>
- #include <objects/general/Dbtag.hpp>
- #include <objects/general/Object_id.hpp>
- #include <objects/seqfeat/Rsite_ref.hpp>
- #include <objects/seqloc/Seq_point.hpp>
- #include <objmgr/seq_vector.hpp>
- #include <objmgr/util/sequence.hpp>
- BEGIN_NCBI_SCOPE
- CAlgoPlugin_RestrictionSites::~CAlgoPlugin_RestrictionSites()
- {
- }
- // standard plugin announce bopilerplate
- void CAlgoPlugin_RestrictionSites::GetInfo(CPluginInfo& info)
- {
- info.Reset();
-
- // version info macro
- info.SetInfo(CPluginVersion::eMajor, CPluginVersion::eMinor, 0,
- string(__DATE__) + " " + string(__TIME__),
- "CAlgoPlugin_RestrictionSites", "Search/Find restriction sites",
- "Search a DNA sequence for restriction sites",
- "");
- // command info
- CPluginCommandSet& cmds = info.SetCommands();
- CPluginCommand& args = cmds.AddAlgoCommand(eAlgoCommand_run);
- args.AddArgument("locs", "Locations to evaluate",
- CSeq_loc::GetTypeInfo(),
- CPluginArg::TData::e_Array);
- args.SetConstraint("locs",
- (*CPluginValueConstraint::CreateSeqMol(),
- CSeq_inst::eMol_na,
- CSeq_inst::eMol_dna,
- CSeq_inst::eMol_rna));
- args.AddDefaultArgument("which_enzymes", "Which restriction enzymes",
- CPluginArg::eString, "commercially available");
- args.SetConstraint("which_enzymes", (*CPluginValueConstraint::CreateSet(),
- "commercially available",
- "prototypes", "all"));
- args.AddDefaultArgument("combine_isoschizomers",
- "Combine isoschizomers",
- CPluginArg::eBoolean, "true");
- args.AddArgument("sort_by_number_of_sites",
- "Sort results by number of (definite) cuts by enzyme",
- CPluginArg::eBoolean);
- }
- // helper functor for sorting CRef<CREnzResult> by the enzyme name
- struct SEnzymeNameCompare
- {
- bool operator()
- (const CRef<CREnzResult>& lhs, const CRef<CREnzResult>& rhs) const
- {
- return lhs->GetEnzymeName() < rhs->GetEnzymeName();
- }
- };
- // helper functor for sorting CREnzyme by the enzyme name
- struct SNameCompare
- {
- bool operator()
- (const CREnzyme& lhs, const CREnzyme& rhs) const
- {
- return lhs.GetName() < rhs.GetName();
- }
- };
- // helper functor for sorting by the number of definite sites
- struct SLessDefSites
- {
- bool operator()
- (const CRef<CREnzResult>& lhs, const CRef<CREnzResult>& rhs) const
- {
- return lhs->GetDefiniteSites().size() < rhs->GetDefiniteSites().size();
- }
- };
- // helper functor for sorting CRef<CSeq_loc>s by location
- struct SLessSeq_loc
- {
- bool operator()
- (const CRef<CSeq_loc>& lhs, const CRef<CSeq_loc>& rhs) const
- {
- return (lhs->Compare(*rhs) < 0);
- }
- };
- static
- void s_AddSitesToAnnot(const vector<CRSite>& sites,
- const CREnzResult& result,
- CSeq_annot& annot,
- const CSeq_loc& parent_loc,
- bool definite = true)
- {
- const CSeq_id& id = sequence::GetId(parent_loc);
- ITERATE (vector<CRSite>, site, sites) {
- // create feature
- CRef<CSeq_feat> feat(new CSeq_feat());
- // start to set up Rsite
- feat->SetData().SetRsite().SetDb().SetDb("REBASE");
- feat->SetData().SetRsite().SetDb()
- .SetTag().SetStr("REBASE");
- string str(result.GetEnzymeName());
- if ( !definite ) {
- str = "Possible " + str;
- }
- feat->SetData().SetRsite().SetStr(str);
- //
- // build our location
- //
- vector< CRef<CSeq_loc> > locs;
- // a loc for the recognition site
- CRef<CSeq_loc> recog_site(new CSeq_loc);
- recog_site->SetInt().SetFrom(site->GetStart());
- recog_site->SetInt().SetTo (site->GetEnd());
- recog_site->SetId(id);
- locs.push_back(recog_site);
- // locs for the cleavage sites
- int negative_cut_locs = 0; // count these exceptions
- ITERATE (vector<int>, cut, site->GetPlusCuts()) {
- if (*cut >= 0 ) {
- CRef<CSeq_loc> cut_site(new CSeq_loc);
- cut_site->SetPnt().SetPoint(*cut);
- // indicate that the cut is to the "left"
- cut_site->SetPnt()
- .SetFuzz().SetLim(CInt_fuzz::eLim_tl);
- cut_site->SetPnt().SetStrand(eNa_strand_plus);
- cut_site->SetId(id);
- locs.push_back(cut_site);
- } else {
- negative_cut_locs++;
- }
- }
- ITERATE (vector<int>, cut, site->GetMinusCuts()) {
- if (*cut >= 0 ) {
- CRef<CSeq_loc> cut_site(new CSeq_loc);
- cut_site->SetPnt().SetPoint(*cut);
- // indicate that the cut is to the "left"
- cut_site->SetPnt()
- .SetFuzz().SetLim(CInt_fuzz::eLim_tl);
- cut_site->SetPnt().SetStrand(eNa_strand_minus);
- cut_site->SetId(id);
- locs.push_back(cut_site);
- } else {
- negative_cut_locs++;
- }
- }
- // comment for those few cases where there are
- // cuts before the sequence begins
- if (negative_cut_locs > 0) {
- string a_comm = NStr::IntToString(negative_cut_locs)
- + " cleavage sites are located before the"
- " beginning of the sequence and are not reported";
- feat->SetComment(a_comm);
- }
- sort(locs.begin(), locs.end(), SLessSeq_loc());
- copy(locs.begin(), locs.end(),
- back_inserter(feat->SetLocation().SetMix().Set()));
- feat->SetLocation
- (*CSeqUtils::RemapChildToParent(parent_loc, feat->GetLocation()));
- // save in annot
- annot.SetData().SetFtable().push_back(feat);
- }
- }
- void CAlgoPlugin_RestrictionSites::RunCommand(CPluginMessage& msg)
- {
- const CPluginCommand& args = msg.GetRequest().GetCommand();
- CPluginReply& reply = msg.SetReply();
- _TRACE("CAlgoPlugin_RestrictionSites::RunCommand()");
-
- // load patterns from file
- CRebase::EEnzymesToLoad which_enzymes
- = x_DecodeWhichEnzymes(args["which_enzymes"].AsString());
- vector<CREnzyme> enzymes;
- try {
- x_LoadREnzymeData(enzymes, which_enzymes);
- }
- catch (const exception& e) {
- NcbiMessageBox(e.what());
- reply.SetStatus(eMessageStatus_failed);
- return;
- }
- // optionally lump together all enzymes with identical specificities
- if (args["combine_isoschizomers"].AsBoolean()) {
- // first sort alphabetically by enzyme name
- sort(enzymes.begin(), enzymes.end(), SNameCompare());
- // now combine isoschizomers
- CREnzyme::CombineIsoschizomers(enzymes);
- }
- if ( !m_Dialog.get() ) {
- m_Dialog.reset(new CMultiColDlg());
- m_Dialog->SetWindowSize(1000, 500);
- m_Dialog->SetTitle("Restriction Sites");
-
- m_Dialog->SetColumn(0, "Sequence", FL_ALIGN_LEFT, 0.3f);
- m_Dialog->SetColumn(1, "Location", FL_ALIGN_LEFT, 0.5f);
- m_Dialog->SetColumn(2, "Enzyme", FL_ALIGN_LEFT, 0.4f);
- m_Dialog->SetColumn(3, "Number of Sites", FL_ALIGN_CENTER, 0.5f);
- m_Dialog->SetColumn(4, "Recog. Site Loc.", FL_ALIGN_CENTER, 0.75f);
- m_Dialog->SetColumn(5, "Plus Strand Cuts", FL_ALIGN_CENTER, 0.75f);
- m_Dialog->SetColumn(6, "Minus Strand Cuts", FL_ALIGN_CENTER, 0.75f);
- }
- m_Dialog->SetRows(0); // to clear any previous contents
- int row = 0;
- plugin_args::TLocList locs;
- GetArgValue(args["locs"], locs);
- ITERATE (plugin_args::TLocList, iter, locs) {
- const CSeq_loc& loc = *iter->second;
- const IDocument& doc = *iter->first;
- // find the best ID for this bioseq
- try {
- CBioseq_Handle handle = doc.GetScope().GetBioseqHandle(loc);
- // get sequence in binary (8na) form
- CSeqVector vec =
- handle.GetSequenceView(loc,
- CBioseq_Handle::eViewConstructed,
- CBioseq_Handle::eCoding_Ncbi);
- string& id_str = m_Dialog->SetCell(row, 0);
- string& loc_str = m_Dialog->SetCell(row, 1);
- const CSeq_id& best_id =
- sequence::GetId(handle, sequence::eGetId_Best);
- id_str.erase();
- best_id.GetLabel(&id_str);
- loc_str = CPluginUtils::GetLabel(loc, &doc.GetScope());
- // a new feature table
- CRef<CSeq_annot> annot(new CSeq_annot());
- // a place to store results (one per enzyme)
- typedef vector<CRef<CREnzResult> > TResults;
- TResults results;
- // do the big search
- CFindRSites::Find(vec, enzymes, results);
- // add description to annot
- annot->SetName("Restriction sites");
- // deal with stored enzyme results
- sort(results.begin(), results.end(), SEnzymeNameCompare());
- if (args["sort_by_number_of_sites"].AsBoolean()) {
- // do a stablesort so alphabetical order preserved
- // within sets with the same number of sites
- stable_sort(results.begin(), results.end(), SLessDefSites());
- }
- // count the number of definite and possible sites
- int total_definite_sites = 0, total_possible_sites = 0;
- int total_non_cutters = 0;
- ITERATE (TResults, result, results) {
- total_definite_sites += (*result)->GetDefiniteSites().size();
- total_possible_sites += (*result)->GetPossibleSites().size();
- if ((*result)->GetDefiniteSites().size()
- + (*result)->GetPossibleSites().size() == 0) {
- total_non_cutters++;
- }
- }
- _TRACE("Found " << total_definite_sites << " definite and "
- << total_possible_sites << " possible sites");
- // in order to build dialog efficiently,
- // pre-allocate rows
- int total_rows = total_definite_sites + total_possible_sites
- + total_non_cutters;
- m_Dialog->SetRows(row + total_rows);
- ITERATE (TResults, result, results) {
-
- //
- // add sites to dialog
- //
- int starting_row = row;
- m_Dialog->SetCell(row, 2) = (*result)->GetEnzymeName();
- string num_sites =
- NStr::IntToString((*result)->GetDefiniteSites().size());
- if (!(*result)->GetPossibleSites().empty()) {
- num_sites += "+";
- num_sites +=
- NStr::IntToString((*result)
- ->GetPossibleSites().size());
- num_sites += "?";
- }
- m_Dialog->SetCell(row, 3) = num_sites;
- const vector<CRSite>& definite_sites
- = (*result)->GetDefiniteSites();
- ITERATE (vector<CRSite>, site, definite_sites) {
- string& pos_str = m_Dialog->SetCell(row, 4);
- pos_str = NStr::IntToString(site->GetStart() + 1) + " - "
- + NStr::IntToString(site->GetEnd() + 1);
- string plus_cuts;
- const vector<int>& pcuts = site->GetPlusCuts();
- ITERATE (vector<int>, cut, pcuts) {
- if (!plus_cuts.empty()) {
- plus_cuts += ", ";
- }
- plus_cuts += NStr::IntToString(*cut);
- }
- m_Dialog->SetCell(row, 5) = plus_cuts;
- string minus_cuts;
- const vector<int>& mcuts = site->GetMinusCuts();
- ITERATE (vector<int>, cut, mcuts) {
- if (!minus_cuts.empty()) {
- minus_cuts += ", ";
- }
- minus_cuts += NStr::IntToString(*cut);
- }
- m_Dialog->SetCell(row, 6) = minus_cuts;
- ++row;
- }
- const vector<CRSite>& possible_sites
- = (*result)->GetPossibleSites();
- ITERATE (vector<CRSite>, site, possible_sites) {
- string& pos_str = m_Dialog->SetCell(row, 4);
- pos_str = string("???")
- + NStr::IntToString(site->GetStart() + 1) + " - "
- + NStr::IntToString(site->GetEnd() + 1);
- string plus_cuts;
- const vector<int>& pcuts = site->GetPlusCuts();
- ITERATE (vector<int>, cut, pcuts) {
- if (!plus_cuts.empty()) {
- plus_cuts += ", ";
- }
- plus_cuts += NStr::IntToString(*cut);
- }
- m_Dialog->SetCell(row, 5) = plus_cuts;
- string minus_cuts;
- const vector<int>& mcuts = site->GetMinusCuts();
- ITERATE (vector<int>, cut, mcuts) {
- if (!minus_cuts.empty()) {
- minus_cuts += ", ";
- }
- minus_cuts += NStr::IntToString(*cut);
- }
- m_Dialog->SetCell(row, 6) = minus_cuts;
- ++row;
- }
- if (row == starting_row) {
- row++;
- }
- //
- // add features to annot
- //
- s_AddSitesToAnnot(definite_sites, **result, *annot, loc);
- s_AddSitesToAnnot(possible_sites, **result, *annot, loc, false);
- }
- // attach annot to doc
- //const_cast<IDocument&>(doc).AttachAnnot(*annot);
- //const_cast<IDocument&>(doc).UpdateAllViews(fDocumentChanged);
- reply.AddObject(doc, *annot);
- }
- catch (exception& e) {
- LOG_POST(Error << e.what());
- string str = CPluginUtils::GetLabel(loc, &doc.GetScope());
- LOG_POST(Error << "Error processing location " << str);
- }
- #ifndef _DEBUG
- catch (...) {
- string str = CPluginUtils::GetLabel(loc, &doc.GetScope());
- LOG_POST(Error << "Error processing location " << str);
- }
- #endif
- }
- // update all views
- //CDocManager::UpdateAllViews();
- //
- // prepare our dialog box
- //
- m_Dialog->SetLabel(string("A search for restriction sites for ")
- + NStr::IntToString(enzymes.size())
- + " enzymes produced:");
- m_Dialog->Show();
- reply.SetStatus(eMessageStatus_success);
- reply.AddAction(CPluginReplyAction::e_Add_to_document);
- }
- void
- CAlgoPlugin_RestrictionSites::x_LoadREnzymeData(vector<CREnzyme>& enzymes,
- CRebase::EEnzymesToLoad which_enzymes)
- {
- CNcbiApplication* app = CNcbiApplication::Instance();
- _ASSERT(app);
- CNcbiRegistry& registry = app->GetConfig();
- string fname;
- // By default the rebase "NAR format" file
- // is assumed to be <std>/etc/rebase.nar.
- // This can be overridden in gbench.ini, via the application registry
- // variable [RESTRICTION_SITES] RebaseData.
- fname = registry.GetString("RESTRICTION_SITES", "RebaseData", "");
- if ( !fname.empty() ) {
- fname += ", ";
- }
- fname += "<home>/etc/rebase.nar, <std>/etc/rebase.nar";
- fname = CSystemPath::ResolvePathExisting(fname);
- if ( fname.empty() ) {
- throw runtime_error("Couldn't open REBASE file ");
- }
- ifstream refile(fname.c_str());
- CRebase::ReadNARFormat(refile, enzymes, which_enzymes);
- }
- // helper function to decode a parameter
- CRebase::EEnzymesToLoad
- CAlgoPlugin_RestrictionSites::x_DecodeWhichEnzymes(const string& s)
- {
- if (s == "commercially available") {
- return CRebase::eCommercial;
- } else if (s == "prototypes") {
- return CRebase::ePrototype;
- } else if (s == "all") {
- return CRebase::eAll;
- } else {
- throw runtime_error(string("Invalid "which_enzymes" string: ")
- + s);
- }
- }
- END_NCBI_SCOPE
- /*
- * ===========================================================================
- * $Log: restriction_sites.cpp,v $
- * Revision 1000.5 2004/06/01 20:55:43 gouriano
- * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.32
- *
- * Revision 1.32 2004/05/21 22:27:47 gorelenk
- * Added PCH ncbi_pch.hpp
- *
- * Revision 1.31 2004/05/03 13:05:42 dicuccio
- * gui/utils --> gui/objutils where needed
- *
- * Revision 1.30 2004/03/05 17:35:37 dicuccio
- * Use sequence::GetId() instead of CSeq_id::GetStringDescr()
- *
- * Revision 1.29 2004/02/17 20:35:25 rsmith
- * moved core/settings.[ch]pp and core/system_path.[ch]pp to config and utils, respectively.
- *
- * Revision 1.28 2004/01/27 18:38:09 dicuccio
- * Code clean-up. Use standard names for plugins. Removed unnecessary #includes
- *
- * Revision 1.27 2004/01/07 15:50:39 dicuccio
- * Adjusted for API change in CPluginUtils::GetLabel(). Standardized exception
- * reporting in algorithms.
- *
- * Revision 1.26 2003/12/31 15:36:08 grichenk
- * Moved CompareLocations() from CSeq_feat to CSeq_loc,
- * renamed it to Compare().
- *
- * Revision 1.25 2003/11/24 15:45:28 dicuccio
- * Renamed CVersion to CPluginVersion
- *
- * Revision 1.24 2003/11/18 17:48:38 dicuccio
- * Added standard processing of return values
- *
- * Revision 1.23 2003/11/04 17:49:23 dicuccio
- * Changed calling parameters for plugins - pass CPluginMessage instead of paired
- * CPluginCommand/CPluginReply
- *
- * Revision 1.22 2003/10/27 17:46:49 dicuccio
- * Removed dead #includes
- *
- * Revision 1.21 2003/10/17 19:10:42 dicuccio
- * Code clean-up. Refactored feature building into a more modular function.
- * Fixed list sorting - replaced with sorting of a vector to work around lack of
- * support for list::sort(functor) in MSVC
- *
- * Revision 1.20 2003/10/17 18:07:47 jcherry
- * Sort Seq-locs in mix by location. Remap features for possible sites too.
- *
- * Revision 1.19 2003/10/15 13:40:26 dicuccio
- * Mkae sure to set the 'id' for the seq-locs before calling RemapChildToParent()
- *
- * Revision 1.18 2003/10/14 16:24:02 dicuccio
- * Added correct remapping of scanned locations to the parent location. Cleaned
- * up code to look for data file - added hierarchichal search through path in INI,
- * user's home directory, and finally system installed path.
- *
- * Revision 1.17 2003/10/07 13:47:00 dicuccio
- * Renamed CPluginURL* to CPluginValue*
- *
- * Revision 1.16 2003/09/25 17:21:35 jcherry
- * Added name to annot
- *
- * Revision 1.15 2003/09/04 14:05:24 dicuccio
- * Use IDocument instead of CDocument
- *
- * Revision 1.14 2003/09/03 14:46:53 rsmith
- * change namespace name from args to plugin_args to avoid clashes with variable names.
- *
- * Revision 1.13 2003/08/22 15:47:45 dicuccio
- * Added 'objects::' where necessary
- *
- * Revision 1.12 2003/08/21 19:24:16 jcherry
- * Moved restriction site finding to algo/sequence
- *
- * Revision 1.11 2003/08/21 18:38:32 jcherry
- * Overloaded CFindRSites::Find to take several sequence containers.
- * Added option to lump together enzymes with identical specificities.
- *
- * Revision 1.10 2003/08/21 12:03:07 dicuccio
- * Make use of new typedef in plugin_utils.hpp for argument values.
- *
- * Revision 1.9 2003/08/20 22:57:44 jcherry
- * Reimplemented restriction site finding using finite state machine
- *
- * Revision 1.8 2003/08/18 19:24:15 jcherry
- * Moved orf and seq_match to algo/sequence
- *
- * Revision 1.7 2003/08/15 16:57:17 jcherry
- * For consecutive enzymes with identical specificities, reuse
- * search results. This saves a bunch of time.
- *
- * Revision 1.6 2003/08/15 16:33:40 jcherry
- * Fixed typo
- *
- * Revision 1.5 2003/08/15 15:26:12 jcherry
- * Changed so that restriction site searching (CFindRSites::Find) returns
- * a vector of CRefs rather than a vector of objects. This speeds sorting.
- *
- * Revision 1.4 2003/08/14 18:33:32 jcherry
- * Use an rsite feature type (rather than a region).
- * Also include cleavage sites in feature location.
- *
- * Revision 1.3 2003/08/13 17:40:26 dicuccio
- * Formatting fixes. Changes some pass-by-val to pass-by-reference. Fixed
- * complement table
- *
- * Revision 1.2 2003/08/13 12:37:58 dicuccio
- * Partial compilation fixes for Windows
- *
- * Revision 1.1 2003/08/12 18:52:58 jcherry
- * Initial version
- *
- * ===========================================================================
- */