genemark_loader.cpp
上传用户:yhdzpy8989
上传日期:2007-06-13
资源大小:13604k
文件大小:18k
- /*
- * ===========================================================================
- * PRODUCTION $Log: genemark_loader.cpp,v $
- * PRODUCTION Revision 1000.5 2004/06/01 20:58:40 gouriano
- * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.35
- * PRODUCTION
- * ===========================================================================
- */
- /* $Id: genemark_loader.cpp,v 1000.5 2004/06/01 20:58:40 gouriano Exp $
- * ===========================================================================
- *
- * PUBLIC DOMAIN NOTICE
- * National Center for Biotechnology Information
- *
- * This software/database is a "United States Government Work" under the
- * terms of the United States Copyright Act. It was written as part of
- * the author's official duties as a United States Government employee and
- * thus cannot be copyrighted. This software/database is freely available
- * to the public for use. The National Library of Medicine and the U.S.
- * Government have not placed any restriction on its use or reproduction.
- *
- * Although all reasonable efforts have been taken to ensure the accuracy
- * and reliability of the software and data, the NLM and the U.S.
- * Government do not and cannot warrant the performance or results that
- * may be obtained by using this software or data. The NLM and the U.S.
- * Government disclaim all warranties, express or implied, including
- * warranties of performance, merchantability or fitness for any particular
- * purpose.
- *
- * Please cite the author in any work or product based on this material.
- *
- * ===========================================================================
- *
- * Authors: Dmitry Dernovoy
- *
- * File Description:
- * CGeneMarkLoader - Plugin to load GeneMark's predictions
- */
- #include <ncbi_pch.hpp>
- #include "genemark_loader.hpp"
- #include <gui/core/idocument.hpp>
- #include <gui/core/version.hpp>
- #include <gui/dialogs/file_browser.hpp>
- #include <gui/plugin/PluginCommandSet.hpp>
- #include <gui/plugin/PluginInfo.hpp>
- #include <gui/plugin/PluginValue.hpp>
- #include <objects/general/Int_fuzz.hpp>
- #include <objects/general/Object_id.hpp>
- #include <objects/seqfeat/Cdregion.hpp>
- #include <objects/seqfeat/Feat_id.hpp>
- #include <objects/seqfeat/Genetic_code.hpp>
- #include <objects/seqfeat/SeqFeatData.hpp>
- #include <objects/seqfeat/Seq_feat.hpp>
- #include <objects/seqloc/Seq_interval.hpp>
- #include <objects/seqloc/Seq_loc.hpp>
- #define GENEMARK_MAXLINE 200
- BEGIN_NCBI_SCOPE
- void CGeneMarkLoader::GetInfo(CPluginInfo& info)
- {
- info.Reset();
- // version info macro
- info.SetInfo(CPluginVersion::eMajor, CPluginVersion::eMinor, 0,
- string(__DATE__) + " " + string(__TIME__),
- "CGeneMarkLoader",
- "GeneMark\/Glimmer output",
- "Load the results from a GeneMark/Glimmer run", "");
- // command info
- CPluginCommandSet& cmds = info.SetCommands();
- CPluginCommand& import_args = cmds.AddDataCommand(eDataCommand_import);
- import_args.AddArgument("document", "Document", CPluginArg::eDocument);
- }
- CGeneMarkLoader::CGeneMarkLoader()
- {
- }
- CGeneMarkLoader::~CGeneMarkLoader()
- {
- }
- void CGeneMarkLoader::Import(CPluginMessage& msg)
- {
- const CPluginCommand& args = msg.GetRequest().GetCommand();
- CPluginReply& reply = msg.SetReply();
- reply.SetStatus(eMessageStatus_failed);
- LOG_POST(Info << "CGeneMarkLoader::Load: start point.");
- IDocument* doc = const_cast<IDocument*> (&args["document"].AsDocument());
- if ( !doc ) {
- reply.SetStatus(eMessageStatus_failed);
- return;
- }
- try {
- LOG_POST(Info << "CGeneMarkLoader::Load: reading file...");
- CConstRef<CSeq_id> id(dynamic_cast<const CSeq_id*> (doc->GetObject()));
- if ( !id ) {
- _TRACE("CGeneMarkLoader::Load: can't get any reasonable seq_id");
- return;
- }
- string acc_text = id->GetSeqIdString(false) ;
- // LOG_POST( Info << "CGeneMarkLoader:: accession: " << acc_text );
- // _TRACE("CGeneMarkLoader:: accession: " << acc_text );
- string tmp_str = string("(") + NStr::ToUpper(acc_text) + string(".fna.g*)"); //"(*.{gmHMM,gmark,glim_*})"
- string fname =
- NcbiFileBrowser("Open GeneMark/GeneMark.hmm/Glimmer output file...",
- tmp_str.c_str(),
- #if defined(NCBI_OS_MSWIN)
- "\\Atlas\b11\tatiana\Predictions\");
- #else
- "/net/atlas/b11/tatiana/Predictions/");
- #endif
- if ( fname.empty() ) {
- reply.SetStatus(eMessageStatus_ignored);
- return;
- }
- CBioseq_Handle handle = doc->GetScope().GetBioseqHandle(*id);
- if ( !handle ) {
- _TRACE("CGeneMarkLoader::Load: can't get bioseq handle");
- return;
- }
- CRef<CSeq_annot> annot;
- switch(x_RecognizeFormat(fname))
- {
- case fGeneMark:
- annot = x_LoadGeneMarkFile(fname, *id);
- break;
- case fGeneMarkHMM:
- annot = x_LoadGeneMarkHmmFile(fname, *id);
- break;
- case fGlimmer2:
- annot = x_LoadGlimmer2File(fname, *id);
- break;
- default:
- _TRACE("Unknown return code");
- case fUnknownFormat:
- _TRACE("Unsupported file format, file: "<< fname);
- reply.SetStatus(eMessageStatus_failed);
- return;
- }
- // save the object in the reply for framework processing
- reply.AddObject(*doc, *annot);
- reply.AddAction(CPluginReplyAction::e_Add_to_document);
- reply.SetStatus(eMessageStatus_success);
- }
- catch (CException& e) {
- LOG_POST(Info << "failed to read GeneMark file: " << e.what());
- _TRACE("failed to read GeneMark file: " << e.what() );
- }
- #ifndef _DEBUG
- catch (...) {
- _TRACE("failed to read GeneMark file: unknown error");
- }
- #endif
- }
- CGeneMarkLoader::EFileFormat
- CGeneMarkLoader::x_RecognizeFormat(const string& fname)
- {
- CNcbiIfstream istr( fname.c_str() );
- char buf[ GENEMARK_MAXLINE +1 ];
- istr.getline(buf, GENEMARK_MAXLINE);
- if( strstr(buf,"r=-1.") || strstr(buf,"r=-0.") ) return fGlimmer2;
- for(int i=1; (i < 30) && !istr.eof() ; ++i)
- {
- istr.getline(buf, GENEMARK_MAXLINE);
- if(strstr(buf, "Strand"))
- {
- if(strstr(buf,"Frame")) return fGeneMark;
- if(strstr(buf,"RightEnd")) return fGeneMarkHMM;
- }
- }
- return fUnknownFormat;
- }
- CRef<CSeq_annot>
- CGeneMarkLoader::x_LoadGlimmer2File(const string& fname,
- const CSeq_id& id)
- {
- CNcbiIfstream istr(fname.c_str());
- CRef<CSeq_annot> annot( new CSeq_annot() );
- annot->AddName("Glimmer2 predictions");
- list< CRef<CSeq_feat> >& ftable = annot->SetData().SetFtable();
- char buf[ GENEMARK_MAXLINE +1 ];
- while( istr.getline(buf, GENEMARK_MAXLINE) )
- {
- CNcbiIstrstream Lstr( buf );
- int gene_number = 0, glim_from = 0, glim_to = 0;
- Lstr >> gene_number;
- Lstr >> glim_from;
- Lstr >> glim_to;
- char strand = 'U';
- for( ; (strand != '-') && (strand != '+'); Lstr >> strand );
- TSeqPos LeftEnd, RightEnd;
- if(strand == '-')
- {
- LeftEnd = glim_to - 3;
- RightEnd = glim_from;
- }else{ // direct
- LeftEnd = glim_from;
- RightEnd = glim_to + 3;
- }
- if(LeftEnd > RightEnd) // Glimmer prediction over zero-point
- {
- // can't handle yet
- LOG_POST(Info << "CGeneMarkLoader::Can't handle Glimmer's prediction over zero-point, skipped.");
- continue;
- }
- CRef<CSeq_feat> feat(new CSeq_feat());
- feat->SetComment() = "Glimmer2 pred #" + NStr::IntToString(gene_number) ;
- CSeq_interval& floc = feat->SetLocation().SetInt();
- floc.SetFrom(LeftEnd- 1);
- floc.SetTo(RightEnd - 1);
- floc.SetStrand((strand == '-') ? eNa_strand_minus : eNa_strand_plus);
- floc.SetId().Assign(id); // floc.SetId().SetGi( NStr::StringToInt(m_SeqId) );
- CSeqFeatData& fdata = feat->SetData();
- CCdregion& cdreg = fdata.SetCdregion();
- cdreg.SetFrame(CCdregion::eFrame_one);
- list< CRef< CGenetic_code::C_E > >& gcode = cdreg.SetCode().Set();
- CRef< CGenetic_code::C_E > ce(new CGenetic_code::C_E);
- ce->SetId(11); // TSE=1; seq=1; feat=1
- gcode.push_back(ce);
- ftable.push_back(feat);
- }
- return annot;
- }
- CRef<CSeq_annot>
- CGeneMarkLoader::x_LoadGeneMarkHmmFile(const string& fname,
- const CSeq_id& id)
- {
- CNcbiIfstream istr(fname.c_str());
- CRef<CSeq_annot> annot( new CSeq_annot() );
- annot->AddName("GeneMark.hmm predictions");
- list< CRef<CSeq_feat> >& ftable = annot->SetData().SetFtable();
- char buf[ GENEMARK_MAXLINE +1 ];
- while( istr.getline(buf, GENEMARK_MAXLINE) ) // find and skip file header
- {
- if(strstr(buf, "Strand") && strstr(buf,"RightEnd"))
- {
- istr.getline(buf, GENEMARK_MAXLINE); // skip another one line
- break;
- }
- }
- while( istr.getline(buf, GENEMARK_MAXLINE) )
- {
- CNcbiIstrstream Lstr( buf ); // LOG_POST(Info << "Parsing line:" << buf);
- int gene_number = 0;
- Lstr >> gene_number;
- char strand = 'U';
- for(; (strand != '-') && (strand != '+'); Lstr >> strand);
- string aLeftEnd;
- TSeqPos LeftEnd, RightEnd;
- Lstr >> aLeftEnd;
- Lstr >> RightEnd;
- if(aLeftEnd[0] == '<') aLeftEnd[0] = ' ';
- LeftEnd = NStr::StringToInt(aLeftEnd);
- // _TRACE("parsed line: "<< gene_number <<" from: "<< LeftEnd <<" to: "<< RightEnd <<" starnd: "<< strand);
- CRef<CSeq_feat> feat(new CSeq_feat);
- feat->SetComment() = "GeneMark.hmm pred #" + NStr::IntToString(gene_number) ;
- CSeq_interval& floc = feat->SetLocation().SetInt();
- floc.SetFrom(LeftEnd- 1);
- floc.SetTo(RightEnd - 1);
- floc.SetStrand((strand == '-') ? eNa_strand_minus : eNa_strand_plus);
- floc.SetId().Assign(id); // floc.SetId().SetGi( NStr::StringToInt(m_SeqId) );
- CSeqFeatData& fdata = feat->SetData();
- CCdregion& cdreg = fdata.SetCdregion();
- cdreg.SetFrame(CCdregion::eFrame_one);
- list< CRef< CGenetic_code::C_E > >& gcode = cdreg.SetCode().Set();
- CRef< CGenetic_code::C_E > ce(new CGenetic_code::C_E);
- ce->SetId(11); // TSE=1; seq=1; feat=1
- gcode.push_back(ce);
- ftable.push_back(feat);
- }
- return annot;
- }
- CRef<CSeq_annot>
- CGeneMarkLoader::x_LoadGeneMarkFile(const string& fname, const CSeq_id& id)
- {
- char buf[ GENEMARK_MAXLINE +1 ];
- CNcbiIfstream istr(fname.c_str());
- while(istr.getline(buf, GENEMARK_MAXLINE)) // find and skip file header
- {
- if(strstr(buf, "Strand") && strstr(buf,"Frame"))
- {
- istr.getline(buf, GENEMARK_MAXLINE); // skip ----- -----
- // istr.getline(buf, GENEMARK_MAXLINE); // sometimes NOT empty !
- break;
- }
- }
- CRef<CSeq_annot> annot( new CSeq_annot() );
- annot->AddName("GeneMark predictions");
- list< CRef<CSeq_feat> >& ftable = annot->SetData().SetFtable();
- TSeqPos prevStop = 0;
- int gene_number = 0;
- CRef<CSeq_feat> feat;
- CSeq_interval *floc;
- while( istr.getline(buf, GENEMARK_MAXLINE) )
- {
- if(strstr(buf, "interest")) break; // we have reached "List of Regions of interest"
- if(strlen(buf) < 10) continue;
- // LOG_POST(Info << "CGeneMarkLoader::Load: parsing line: " << buf );
- CNcbiIstrstream Lstr( buf );
- string strand;
- TSeqPos LeftEnd, RightEnd, curStop;
- Lstr >> LeftEnd;
- Lstr >> RightEnd;
- Lstr >> strand;
- bool is_complementary = (strand.find("complement") != string::npos);
- curStop = is_complementary ? LeftEnd : RightEnd;
- if(curStop != prevStop) // new lines-group
- {
- prevStop = curStop;
- ++gene_number;
- feat = new CSeq_feat();
- feat->SetComment() = "GeneMark pred #" + NStr::IntToString(gene_number) ;
- ftable.push_back(feat);
- CSeqFeatData& fdata = feat->SetData();
- CCdregion& cdreg = fdata.SetCdregion();
- cdreg.SetFrame(CCdregion::eFrame_one);
- list< CRef< CGenetic_code::C_E > >& gcode = cdreg.SetCode().Set();
- CRef< CGenetic_code::C_E > ce(new CGenetic_code::C_E);
- ce->SetId(11); // TSE=1; seq=1; feat=1
- gcode.push_back(ce);
- floc = & feat->SetLocation().SetInt();
- floc->SetFrom(LeftEnd-1);
- floc->SetTo(RightEnd -1);
- floc->SetStrand(is_complementary ? eNa_strand_minus : eNa_strand_plus);
- floc->SetId().Assign(id); // floc->SetId().SetGi( NStr::StringToInt(m_SeqId) );
- }
- else // another start of previous gene
- {
- if(is_complementary)
- {
- floc->SetFuzz_to().SetAlt().push_back( RightEnd -1);
- }
- else
- {
- floc->SetFuzz_from().SetAlt().push_back( LeftEnd -1); // LOG_POST(Info << "Left:"<<LeftEnd);
- }
- }
- }
- return annot;
- }
- END_NCBI_SCOPE
- /*
- * =====================================================================
- * $Log: genemark_loader.cpp,v $
- * Revision 1000.5 2004/06/01 20:58:40 gouriano
- * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.35
- *
- * Revision 1.35 2004/05/21 22:27:48 gorelenk
- * Added PCH ncbi_pch.hpp
- *
- * Revision 1.34 2004/03/11 17:44:00 dicuccio
- * Use new file loader dialog
- *
- * Revision 1.33 2003/12/22 20:29:47 dernovoy
- * skip Glimmer's prediction over zero-point
- *
- * Revision 1.32 2003/12/16 20:24:55 dernovoy
- * Path to precomputed predictions should work on both Windows and UNIX inside NCBI
- *
- * Revision 1.31 2003/12/10 22:53:24 dernovoy
- * defaults directory and file extensions added
- *
- * Revision 1.30 2003/12/09 23:22:14 dernovoy
- * menu string changed : comma instead of slash
- *
- * Revision 1.29 2003/12/09 21:46:00 dernovoy
- * Glimmer2 output loader was added
- *
- * Revision 1.28 2003/11/24 15:45:40 dicuccio
- * Renamed CVersion to CPluginVersion
- *
- * Revision 1.27 2003/11/18 17:49:26 dicuccio
- * Added standard processing of return values
- *
- * Revision 1.26 2003/11/04 17:49:25 dicuccio
- * Changed calling parameters for plugins - pass CPluginMessage instead of paired
- * CPluginCommand/CPluginReply
- *
- * Revision 1.25 2003/10/10 17:19:33 dicuccio
- * Added Import() interface. Removed dead Save() interfaces
- *
- * Revision 1.24 2003/10/07 13:47:06 dicuccio
- * Renamed CPluginURL* to CPluginValue*
- *
- * Revision 1.23 2003/09/17 16:27:28 dicuccio
- * Removed load command
- *
- * Revision 1.22 2003/09/04 14:51:59 dicuccio
- * Use IDocument instead of CDocument
- *
- * Revision 1.21 2003/07/14 11:17:25 shomrat
- * Plugin messaging system related changes
- *
- * Revision 1.20 2003/06/25 17:02:59 dicuccio
- * Split CPluginHandle into a handle (pointer-to-implementation) and
- * implementation file. Lots of #include file clean-ups.
- *
- * Revision 1.19 2003/06/20 14:52:58 dicuccio
- * Revised plugin registration - moved GetInfo() into the plugin handler
- *
- * Revision 1.18 2003/05/19 13:40:45 dicuccio
- * Moved gui/core/plugin/ -> gui/plugin/. Merged core libraries into libgui_core.
- * Removed old, unused dialog box.
- *
- * Revision 1.17 2003/04/24 16:39:29 dicuccio
- * Updated to reflect changes in plugin API
- *
- * Revision 1.16 2003/02/24 13:03:16 dicuccio
- * Renamed classes in plugin spec:
- * CArgSeg --> CPluginArgSet
- * CArgument --> CPluginArg
- * CPluginArgs --> CPluginCommand
- * CPluginCommands --> CPluginCommandSet
- *
- * Revision 1.15 2003/02/20 19:49:56 dicuccio
- * Created new plugin architecture, based on ASN.1 spec. Moved GBENCH framework
- * over to use new plugin architecture.
- *
- * Revision 1.14 2003/02/06 18:48:36 dicuccio
- * Made 'catch (...)' conditional for non-debug builds
- *
- * Revision 1.13 2003/01/15 19:47:37 dernovoy
- * accession can be used for seq_id in features (was: only gi).
- *
- * Revision 1.12 2003/01/13 13:10:07 dicuccio
- * Namespace clean-up. Retired namespace gui -> converted all to namespace ncbi.
- * Moved all FLUID-generated code into namespace ncbi.
- *
- * Revision 1.11 2003/01/08 21:17:59 dernovoy
- * Feature's location fixed (from local to gi),
- * GeneMark coordinates translated to ncbi format (0 - (Len-1))
- *
- * Revision 1.10 2003/01/06 21:03:12 dernovoy
- * Support for Alternative starts (fuzz-from/to) of GeneMark's output
- *
- * Revision 1.9 2003/01/02 21:41:37 dernovoy
- * Load of genemark output added, the farthest starts taken for features
- *
- * Revision 1.8 2003/01/02 19:58:38 dernovoy
- * fix comparing stream with int
- *
- * Revision 1.7 2002/12/31 19:45:20 dernovoy
- * Added support for genemarkHMM start positions started with symbol '<'
- *
- * Revision 1.6 2002/12/30 17:48:29 dicuccio
- * Added mechanism for data loader plugins to announce supported modes of
- * operation (load, import, save currently)
- *
- * Revision 1.5 2002/12/30 15:47:10 dernovoy
- * TRACE outputs added, any doc's updates comments out
- *
- * Revision 1.4 2002/12/26 20:50:25 dernovoy
- * *** empty log message ***
- *
- * Revision 1.3 2002/12/26 20:48:59 dernovoy
- * Log output
- *
- * Revision 1.2 2002/12/26 17:45:55 dicuccio
- * Reformatted code (reindent)
- *
- * Revision 1.1 2002/12/26 17:12:38 dernovoy
- * Initial revision.
- *
- * =====================================================================
- */