user_feature_dload.cpp
上传用户:yhdzpy8989
上传日期:2007-06-13
资源大小:13604k
文件大小:12k
- /*
- * ===========================================================================
- * PRODUCTION $Log: user_feature_dload.cpp,v $
- * PRODUCTION Revision 1000.2 2004/06/01 19:42:52 gouriano
- * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.3
- * PRODUCTION
- * ===========================================================================
- */
- /* $Id: user_feature_dload.cpp,v 1000.2 2004/06/01 19:42:52 gouriano Exp $
- * ===========================================================================
- *
- * PUBLIC DOMAIN NOTICE
- * National Center for Biotechnology Information
- *
- * This software/database is a "United States Government Work" under the
- * terms of the United States Copyright Act. It was written as part of
- * the author's official duties as a United States Government employee and
- * thus cannot be copyrighted. This software/database is freely available
- * to the public for use. The National Library of Medicine and the U.S.
- * Government have not placed any restriction on its use or reproduction.
- *
- * Although all reasonable efforts have been taken to ensure the accuracy
- * and reliability of the software and data, the NLM and the U.S.
- * Government do not and cannot warrant the performance or results that
- * may be obtained by using this software or data. The NLM and the U.S.
- * Government disclaim all warranties, express or implied, including
- * warranties of performance, merchantability or fitness for any particular
- * purpose.
- *
- * Please cite the author in any work or product based on this material.
- *
- * ===========================================================================
- *
- * Authors: Josh Cherry
- *
- * File Description: Data loader for user features in tabular form
- *
- */
- #include <ncbi_pch.hpp>
- #include <objtools/data_loaders/table/user_feature_dload.hpp>
- #include <sqlite/sqlite.hpp>
- #include <objects/general/User_object.hpp>
- #include <objects/general/User_field.hpp>
- #include <objects/general/Object_id.hpp>
- #include <objects/seqfeat/Seq_feat.hpp>
- #include <objects/seqloc/Seq_loc.hpp>
- #include <objects/seqloc/Seq_id.hpp>
- #include <objects/seqloc/Seq_interval.hpp>
- #include <objects/seqset/Seq_entry.hpp>
- #include <objects/seqset/Bioseq_set.hpp>
- #include <objmgr/scope.hpp>
- #include <objmgr/impl/data_source.hpp>
- #include <objmgr/impl/synonyms.hpp>
- #include <objmgr/impl/handle_range_map.hpp>
- #include <serial/serial.hpp>
- #include <serial/objostr.hpp>
- BEGIN_NCBI_SCOPE
- USING_SCOPE(objects);
- bool CUsrFeatDataLoader::SIdHandleByContent::operator()
- (const CSeq_id_Handle& h1, const CSeq_id_Handle& h2) const
- {
- CConstRef<CSeq_id> id1 = h1.GetSeqId();
- CConstRef<CSeq_id> id2 = h2.GetSeqId();
- return (*id1 < *id2);
- }
- CUsrFeatDataLoader::CUsrFeatDataLoader(const string& input_file,
- const string& temp_file,
- bool delete_file,
- EOffset offset,
- const string& type,
- const CSeq_id* given_id)
- : CDataLoader(input_file), m_Offset(offset), m_Type(type)
- {
- //
- // create our SQLite DB
- //
- m_Table.Reset(new CSQLiteTable(input_file, temp_file, delete_file));
- //
- // now, store some precalculated info about our table
- //
- {{
- // extract the column names
- list<string> cols;
- m_Table->GetColumnTitles(cols);
- m_Cols.reserve(cols.size());
- std::copy(cols.begin(), cols.end(), back_inserter(m_Cols));
- }}
- // determine our column mapping
- int i = 0;
- m_ColAssign.resize(m_Cols.size(), eUnknown);
- fill(m_ColIdx, m_ColIdx + eMaxKnownCol, -1);
- ITERATE(vector<string>, iter, m_Cols) {
- string str(*iter);
- NStr::ToLower(str);
- if (str == "contig" ||
- str == "contig_accession" ||
- str == "accession" ||
- str == "id") {
- m_ColAssign[i] = eAccession;
- m_ColIdx[eAccession] = i;
- } else if (str == "from") {
- m_ColAssign[i] = eFrom;
- m_ColIdx[eFrom] = i;
- } else if (str == "to") {
- m_ColAssign[i] = eTo;
- m_ColIdx[eTo] = i;
- } else if (str == "type") {
- m_ColAssign[i] = eType;
- m_ColIdx[eType] = i;
- } else if (str == "strand" ||
- str == "orientation") {
- m_ColAssign[i] = eStrand;
- m_ColIdx[eStrand] = i;
- }
- ++i;
- }
- if (given_id) {
- m_SeqId.Reset(given_id);
- }
- if (!m_SeqId && m_ColIdx[eAccession] == -1) {
- LOG_POST(Info << "CUsrFeatDataLoader: no id column in file, "
- "and no id given as parameter");
- throw runtime_error("no id column in file, "
- "and no id given as parameter");
- }
- CSQLite& sqlite = m_Table->SetDB();
- if (!m_SeqId) {
- string acc_col = "col" + NStr::IntToString(m_ColIdx[eAccession]);
- // create an index on accession
- try {
- sqlite.Execute("create index IDX_accession "
- "on TableData (" + acc_col + ")");
- }
- catch (...) {
- // index already exists - ignored
- }
- // extract a list of the accessions we have
- CRef<CSQLiteQuery> q
- (sqlite.Compile("select distinct " + acc_col +
- " from TableData order by " + acc_col));
- int count;
- const char** data = NULL;
- const char** cols = NULL;
- while (q->NextRow(count, data, cols)) {
- CRef<CSeq_id> id(new CSeq_id(data[0]));
- if (id->Which() == CSeq_id::e_not_set) {
- LOG_POST(Error << "failed to index id = " << data[0]);
- continue;
- }
- CSeq_id_Handle handle = CSeq_id_Handle::GetHandle(*id);
- m_Ids.insert(TIdMap::value_type(handle, data[0]));
- _TRACE(" id = " << data[0]);
- }
- LOG_POST(Info << "CUsrFeatDataLoader: "
- << m_Ids.size() << " distinct ids");
- } else {
- LOG_POST(Info << "CUsrFeatDataLoader: using single "
- "specified id");
- }
- }
- // Request from a datasource using handles and ranges instead of seq-loc
- // The TSEs loaded in this call will be added to the tse_set.
- void CUsrFeatDataLoader::GetRecords(const CSeq_id_Handle& idh,
- EChoice choice)
- {
- //
- // find out if we've already loaded annotations for this seq-id
- //
- TEntries::iterator iter = m_Entries.find(idh);
- if (iter != m_Entries.end()) {
- return;
- }
- CRef<CSeq_annot> annot = GetAnnot(idh);
- if (!annot) {
- return;
- }
- CRef<CSeq_entry> entry;
- // we then add the object to the data loader
- // we need to create a dummy TSE for it first
- entry.Reset(new CSeq_entry());
- entry->SetSet().SetSeq_set();
- entry->SetSet().SetAnnot().push_back(annot);
- GetDataSource()->AddTSE(*entry);
-
- _TRACE("CUsrFeatDataLoader(): loaded "
- << annot->GetData().GetFtable().size()
- << " features for " << idh.AsString());
-
- // we always save an entry here. If the entry is empty,
- // we have no information about this sequence, but we at
- // least don't need to repeat an expensive search
- m_Entries[idh] = entry;
- }
- CRef<CSeq_annot> CUsrFeatDataLoader::GetAnnot(const CSeq_id_Handle& idh)
- {
- CRef<CSeq_annot> annot;
- CSQLiteTable::TIterator row_iter;
- if (!m_SeqId) {
- //
- // find out if this ID is in our list of ids
- //
- pair<TIdMap::iterator, TIdMap::iterator> id_iter
- = m_Ids.equal_range(idh);
- if (id_iter.first == id_iter.second) {
- return annot; // null CRef
- }
- // select just the rows with that match the id
- string acc_col = "col" + NStr::IntToString(m_ColIdx[eAccession]);
- string sql("select * from TableData where " + acc_col +
- " in (");
- string tmp;
- for ( ; id_iter.first != id_iter.second; ++id_iter.first) {
- TIdMap::iterator iter = id_iter.first;
-
- if ( !tmp.empty() ) {
- tmp += ", ";
- }
- tmp += "'" + iter->second + "'";
- }
- sql += tmp + ")";
- row_iter = m_Table->Begin(sql);
-
- } else {
- // check that this is the right id
- if (!idh.GetSeqId()->Match(*m_SeqId)) {
- return annot; // null CRef
- }
- // select 'em all
- row_iter = m_Table->Begin("select * from TableData");
- }
- annot.Reset(new CSeq_annot());
- vector<string> data;
- for ( ; *row_iter; ++(*row_iter)) {
- list<string> temp;
- (*row_iter).GetRow(temp);
- data.resize(temp.size());
- std::copy(temp.begin(), temp.end(), data.begin());
- // create a new feature
- CRef<CSeq_feat> feat(new CSeq_feat());
- CSeq_loc& loc = feat->SetLocation();
- loc.SetInt().SetId().Assign(*idh.GetSeqId());
- CUser_object& user = feat->SetData().SetUser();
- // fill in our columns
- TSeqPos from;
- TSeqPos to;
- string strand_str;
- for (unsigned int i = 0; i < data.size(); ++i) {
- switch (m_ColAssign[i]) {
- case eAccession:
- // already handled as ID...
- break;
- case eStrand:
- strand_str = NStr::ToLower(data[i]);
- if (strand_str == "+" || strand_str == "plus" ||
- strand_str == "positive" || strand_str == "forward") {
- loc.SetInt().SetStrand(eNa_strand_plus);
- } else if (strand_str == "-" || strand_str == "minus" ||
- strand_str == "negative" ||
- strand_str == "reverse") {
- loc.SetInt().SetStrand(eNa_strand_minus);
- } else if (strand_str == "b" || strand_str == "+/-" ||
- strand_str == "both") {
- loc.SetInt().SetStrand(eNa_strand_both);
- } else {
- throw runtime_error(string("Invalid strand designation: ")
- + data[i]);
- }
- break;
- case eFrom:
- from = NStr::StringToInt(data[i]);
- break;
- case eTo:
- to = NStr::StringToInt(data[i]);
- break;
- case eType:
- user.SetType().SetStr(data[i]);
- break;
- case eUnknown:
- default:
- // add a user field, unless column title
- // starts with '.'
- if (!NStr::StartsWith(m_Cols[i], ".")) {
- user.AddField(m_Cols[i], data[i]);
- }
- break;
- }
- }
- loc.SetInt().SetFrom(from - m_Offset);
- loc.SetInt().SetTo (to - m_Offset);
- if (!m_Type.empty() || m_ColIdx[eType] == -1) {
- user.SetType().SetStr(m_Type);
- }
- annot->SetData().SetFtable().push_back(feat);
- }
- return annot;
- }
- END_NCBI_SCOPE
- /*
- * ===========================================================================
- * $Log: user_feature_dload.cpp,v $
- * Revision 1000.2 2004/06/01 19:42:52 gouriano
- * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.3
- *
- * Revision 1.3 2004/05/21 21:42:53 gorelenk
- * Added PCH ncbi_pch.hpp
- *
- * Revision 1.2 2003/11/28 13:41:10 dicuccio
- * Fixed to match new API in CDataLoader
- *
- * Revision 1.1 2003/11/14 19:09:18 jcherry
- * Initial version
- *
- * ===========================================================================
- */