signal_seq.cpp
上传用户:yhdzpy8989
上传日期:2007-06-13
资源大小:13604k
文件大小:12k
- /*
- * ===========================================================================
- * PRODUCTION $Log: signal_seq.cpp,v $
- * PRODUCTION Revision 1000.2 2004/06/01 18:10:56 gouriano
- * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.7
- * PRODUCTION
- * ===========================================================================
- */
- /* $Id: signal_seq.cpp,v 1000.2 2004/06/01 18:10:56 gouriano Exp $
- * ===========================================================================
- *
- * PUBLIC DOMAIN NOTICE
- * National Center for Biotechnology Information
- *
- * This software/database is a "United States Government Work" under the
- * terms of the United States Copyright Act. It was written as part of
- * the author's official duties as a United States Government employee and
- * thus cannot be copyrighted. This software/database is freely available
- * to the public for use. The National Library of Medicine and the U.S.
- * Government have not placed any restriction on its use or reproduction.
- *
- * Although all reasonable efforts have been taken to ensure the accuracy
- * and reliability of the software and data, the NLM and the U.S.
- * Government do not and cannot warrant the performance or results that
- * may be obtained by using this software or data. The NLM and the U.S.
- * Government disclaim all warranties, express or implied, including
- * warranties of performance, merchantability or fitness for any particular
- * purpose.
- *
- * Please cite the author in any work or product based on this material.
- *
- * ===========================================================================
- *
- * Authors: Josh Cherry
- *
- * File Description: Prediction of signal sequences from protein sequence
- * according to von Heijne, 1986 and 1987
- *
- */
- #include <ncbi_pch.hpp>
- #include <algo/sequence/signal_seq.hpp>
- #include <stdexcept>
- BEGIN_NCBI_SCOPE
- USING_SCOPE(objects);
- /*
- * Weight matrices consumed by x_PredictSignalSeq (eukaryotic table first,
- * bacterial table further below).  Each table is 26 x 15 and is read as
- * Mat[residue code][offset within the 15-residue scoring window]; the letter
- * belonging to each row is given in that row's trailing comment.
- *
- * Column 12 is the residue that x_PredictSignalSeq reports as the position
- * before the cut; column 10 lies two residues upstream of it.  The entries
- * near -23 .. -25 occur only in those two columns and are large enough to
- * outweigh every other entry of a window.
- *
- * Rows consisting only of zeros add nothing to a window's score.
- *
- * NOTE(review): the values look like natural-log weights (0.693147 = ln 2)
- * taken from the von Heijne papers cited in the file header -- confirm
- * against the publications before editing any number.
- */
- // We have to declare these extern to be accessible from template on ForteCC.
- extern const double const_EukMat[26][15];
- extern const double const_BacMat[26][15];
- const double const_EukMat[26][15] = { // Scoring matrix for eukaryotic signal sequences
- {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // code 0: unlabeled in the original; presumably the NCBIstdaa gap '-' (confirm)
- {0.0984401, -0.109199, -0.0350913, 0.0339016, 0.321584,
- 0.216223, 0.216223, 0.159065, 0.544727, 0.0339016,
- 1.176, -0.882389, 1.70788, 0.216223, -0.882389}, // A
- {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // B (?)
- {-0.405465, 0.287682, 0.693147, 0.441833, 0.693147,
- 1.13498, 0.287682, 0.575364, 0.105361, 0.287682,
- 1.44036, -0.405465, 0.693147, 0.575364, -0.405465}, // C
- {-2.18605, -2.18605, -2.18605, -2.18605, -2.18605,
- -2.18605, -2.18605, -2.18605, -0.576613, -1.08744,
- -25.2119, -0.576613, -25.2119, 0.116534, 0.211844}, // D
- {-2.30259, -2.30259, -2.30259, -2.30259, -2.30259,
- -2.30259, -2.30259, -2.30259, -1.20397, -0.356675,
- -25.3284, -0.356675, -25.3284, 0.262364, 0.336472}, // E
- {0.842183, 0.474458, 0.675129, 0.675129, 0.0689929,
- 0.223144, 1.16761, 0.842183, -0.336472, -0.113329,
- -24.7486, 0.842183, -24.7486, 0.0689929, -0.336472}, // F
- {-1.10691, -1.10691, -1.39459, -0.701446, -1.39459,
- 0.0717439, -1.39459, -1.80006, 0.451234, 1.03316,
- -0.883768, -0.547295, 1.17036, -0.19062, -0.547295}, // G
- {-1.22378, -1.22378, -1.22378, -1.22378, -1.22378,
- -1.22378, -1.22378, -1.22378, 0.385662, -1.22378,
- -24.2496, 0.567984, -24.2496, 0.162519, -0.530628}, // H
- {0.70657, 0.70657, 0.0779615, -0.209721, 0.396415,
- -0.392042, -0.615186, 0.0779615, -0.392042, -2.00148,
- 0.301105, -0.392042, -25.0273, 0.0779615, -0.0555699}, // I
- {-2.4248, -2.4248, -2.4248, -2.4248, -2.4248,
- -2.4248, -2.4248, -2.4248, -2.4248, -1.03851,
- -25.4507, -1.73166, -25.4507, -0.0269075, -0.227578}, // K
- {1.76947, 1.7263, 1.78346, 1.87624, 1.8635,
- 1.31346, 1.66568, 1.39861, -0.19062, 0.642289,
- -0.413764, 0.502527, -2.49321, -0.413764, -1.10691}, // L
- {-0.993252, 0.105361, 0.952658, 0.393043, -0.993252,
- 0.798508, -0.300105, -0.300105, -0.993252, -0.993252,
- -24.0191, -0.993252, -24.0191, -0.993252, -0.300105}, // M
- {-1.96009, -1.96009, -1.96009, -1.96009, -1.96009,
- -1.96009, -1.96009, -1.96009, -0.861482, -0.861482,
- -24.9859, 0.34249, -24.9859, -0.5738, -0.0141846}, // N
- {-1.30833, -2.00148, -1.30833, -2.00148, -2.00148,
- -0.615186, -2.00148, 0.0779615, 0.994252, 0.637577,
- -25.0273, -2.00148, -0.902868, -2.00148, 1.08956}, // P
- {-1.84055, -1.84055, -1.84055, -1.84055, -1.84055,
- -0.0487902, -1.84055, -1.84055, 0.462035, 0.238892,
- -24.8664, 1.04982, -0.741937, 1.10389, 0.462035}, // Q
- {-1.335, -2.02815, -2.02815, -2.02815, -2.02815,
- -2.02815, -2.02815, -2.02815, -0.0822381, -0.641854,
- -25.054, 0.679902, -25.054, 0.456758, 0.169076}, // R
- {-0.236389, -1.335, -0.354172, -0.641854, 0.131336,
- -0.131028, 0.274437, 0.338975, 0.824483, -0.0357181,
- 0.701881, 0.3996, 0.562119, 0.274437, -0.131028}, // S
- {-1.57898, 0.0304592, -0.662688, -0.885832, -0.662688,
- 0.292823, -0.326216, -0.326216, 0.212781, -0.480366,
- 0.561087, -0.192684, -0.480366, -1.17351, 0.0304592}, // T
- {0.588787, 0.811931, 0.301105, 0.483427, 0.158004,
- 0.301105, -0.00904984, 0.888892, -2.40695, 0.0779615,
- 1.05879, -1.30833, -25.4328, -0.327504, 0.426268}, // V
- {0.798508, 0.510826, 0.510826, -0.587787, -0.587787,
- 0.105361, 1.20397, 0.510826, -0.587787, 0.510826,
- -23.6136, 1.60944, -23.6136, 0.105361, -0.587787}, // W
- {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // X (?)
- {-1.72277, -1.72277, -0.336472, -1.72277, -1.72277,
- -1.72277, -0.624154, -1.72277, -1.72277, -1.02962,
- -24.7486, -0.113329, -24.7486, -1.72277, 0.223144}, // Y
- {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Z (?)
- {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // U (?)
- {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} // * (?)
- };
- /*
- * Scoring matrix for bacterial signal sequences.
- *
- * Selected by x_PredictSignalSeq when the domain is CSignalSeq::eBacterial
- * and read as Mat[residue code][offset within the 15-residue window]; the
- * letter belonging to each row is given in that row's trailing comment.
- * Rows consisting only of zeros add nothing to a window's score.  The
- * entries near -22 .. -24 occur only in columns 10 and 12 and outweigh
- * every other entry of a window.
- *
- * NOTE(review): the values look like natural-log weights (0.693147 = ln 2)
- * taken from the von Heijne papers cited in the file header -- confirm
- * against the publications before editing any number.
- */
- const double const_BacMat[26][15] = {
- {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // code 0: unlabeled in the original; presumably the NCBIstdaa gap '-' (confirm)
- {1.13943, 0.916291, 0.916291, 1.03407, 0.628609,
- 0.782759, 0.446287, 0.628609, 0.782759, 0.782759,
- 2.0149, -0.470004, 2.27084, 1.72722, 0.223144}, // A
- {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // B (?)
- {0, 0, 0, 0, 0,
- 0, 0, 0, 0, 0,
- -23.0259, 0, -23.0259, 0, 0}, // C
- {-0.693147, -0.693147, -0.693147, -0.693147, -0.693147,
- -0.693147, -0.693147, -0.693147, -0.693147, -0.693147,
- -23.719, -0.693147, -23.719, 0, 1.38629}, // D
- {-0.788457, -0.788457, -0.788457, -0.788457, -0.788457,
- -0.788457, -0.788457, -0.788457, -0.788457, -0.788457,
- -23.8143, -0.788457, -23.8143, 0.597837, 1.29098}, // E
- {0.430783, 1.12393, 0.836248, 1.12393, -0.262364,
- -0.262364, 1.81708, -0.262364, 1.12393, -0.262364,
- -23.2882, 1.68355, -23.2882, -0.262364, -0.262364}, // F
- {0.393043, -0.300105, -0.300105, -0.300105, 0.105361,
- 0.616186, -0.300105, 0.393043, -0.300105, -0.300105,
- -24.0191, -0.300105, -0.300105, -0.993252, -0.993252}, // G
- {0.223144, 0.223144, 0.223144, 0.223144, 0.223144,
- 0.223144, 0.223144, 0.223144, 0.223144, 0.223144,
- -22.8027, 2.16905, -22.8027, 0.223144, 0.223144}, // H
- {0.567984, -0.530628, 1.07881, -0.530628, 1.07881,
- -0.530628, -0.530628, 0.567984, -0.530628, -0.530628,
- -23.5565, -0.530628, -23.5565, -0.530628, 0.162519}, // I
- {-0.916291, -0.916291, -0.916291, -0.916291, -0.916291,
- -0.916291, -0.916291, -0.916291, -0.916291, -0.916291,
- -23.9421, -0.223144, -23.9421, 0.182322, -0.916291}, // K
- {1.08619, 1.40464, 1.20397, 1.08619, 1.20397,
- 1.5717, -0.993252, -0.993252, -0.300105, -0.300105,
- -0.993252, -0.300105, -24.0191, -0.993252, -0.993252}, // L
- {0.510826, 1.20397, 0.510826, 0.510826, 1.60944,
- 1.20397, 1.60944, 0.510826, 0.510826, 1.20397,
- -22.515, 1.89712, -22.515, 0.510826, 0.510826}, // M
- {-0.470004, -0.470004, -0.470004, -0.470004, -0.470004,
- -0.470004, -0.470004, -0.470004, -0.470004, -0.470004,
- -23.4959, 0.628609, -23.4959, -0.470004, 0.916291}, // N
- {-0.530628, -0.530628, -0.530628, -0.530628, -0.530628,
- -0.530628, 0.162519, 0.567984, 1.07881, 0.162519,
- -23.5565, -0.530628, -23.5565, -0.530628, 1.07881}, // P
- {-0.336472, -0.336472, -0.336472, -0.336472, -0.336472,
- -0.336472, -0.336472, -0.336472, 0.356675, 0.356675,
- -23.3623, 0.76214, -23.3623, -0.336472, -0.336472}, // Q
- {-0.530628, -0.530628, -0.530628, -0.530628, -0.530628,
- -0.530628, -0.530628, -0.530628, -0.530628, -0.530628,
- -23.5565, -0.530628, -23.5565, -0.530628, -0.530628}, // R
- {-0.955511, -0.955511, -0.955511, 0.430783, 0.430783,
- -0.955511, 0.653926, 1.75254, 0.653926, 1.12393,
- 0.653926, -0.262364, -0.262364, -0.955511, -0.955511}, // S
- {-0.0953102, -0.788457, 0.597837, -0.0953102, -0.0953102,
- -0.0953102, -0.0953102, -0.0953102, 0.820981, -0.788457,
- 0.310155, -0.788457, -0.788457, -0.788457, -0.0953102}, // T
- {0.693147, 1.02962, -0.916291, 0.182322, -0.916291,
- 0.470004, 1.02962, -0.916291, -0.916291, 0.470004,
- 0.182322, -0.916291, -23.9421, -0.223144, -0.916291}, // V
- {0.916291, 0.916291, 0.916291, 0.916291, 0.916291,
- 0.916291, 0.916291, 0.916291, 0.916291, 0.916291,
- -22.1096, 0.916291, -22.1096, 0.916291, 0.916291}, // W
- {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // X (?)
- {-0.262364, -0.262364, -0.262364, -0.262364, -0.262364,
- -0.262364, -0.262364, -0.262364, -0.262364, 0.836248,
- -23.2882, -0.262364, -23.2882, -0.262364, -0.262364}, // Y
- {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // Z (?)
- {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, // U (?)
- {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} // * (?)
- };
- template<class Seq>
- void x_PredictSignalSeq(const Seq& seq, CSignalSeq::EDomain domain,
- TSeqPos max_pos, TSeqPos& pos, double& score)
- {
- const double (*Mat)[15];
- if (domain == CSignalSeq::eBacterial) {
- Mat = const_BacMat;
- } else {
- Mat = const_EukMat;
- }
- TSeqPos max_index = min((TSeqPos)seq.size() - 15, max_pos - 12);
-
- double max_score = -1e6;
- TSeqPos max_loc;
- for (unsigned int i = 0; i <= max_index; i++) {
- double sum = 0;
- for (unsigned int j = 0; j < 15; j++) {
- sum += Mat[seq[i + j]][j];
- }
- if (sum > max_score) {
- max_score = sum;
- max_loc = i;
- }
- }
- score = max_score;
- pos = max_loc + 12; // position before cut
- }
- void CSignalSeq::Predict(const string& seq, EDomain domain,
- TSeqPos max_pos, TSeqPos& pos, double& score)
- {
- x_PredictSignalSeq(seq, domain, max_pos, pos, score);
- }
- void CSignalSeq::Predict(const vector<char>& seq, EDomain domain,
- TSeqPos max_pos, TSeqPos& pos, double& score)
- {
- x_PredictSignalSeq(seq, domain, max_pos, pos, score);
- }
- void CSignalSeq::Predict(const CSeqVector& seq, EDomain domain,
- TSeqPos max_pos, TSeqPos& pos, double& score)
- {
- string seq_ncbistdaa;
- CSeqVector vec(seq);
- vec.SetNcbiCoding();
- vec.GetSeqData(0, vec.size(), seq_ncbistdaa);
- x_PredictSignalSeq(seq_ncbistdaa, domain, max_pos, pos, score);
- }
- END_NCBI_SCOPE
- /*
- * ===========================================================================
- * $Log: signal_seq.cpp,v $
- * Revision 1000.2 2004/06/01 18:10:56 gouriano
- * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.7
- *
- * Revision 1.7 2004/05/21 21:41:04 gorelenk
- * Added PCH ncbi_pch.hpp
- *
- * Revision 1.6 2004/03/16 19:40:09 vasilche
- * Made static const arrays accessible from template on ForteCC
- *
- * Revision 1.5 2004/03/15 12:30:19 dicuccio
- * Changed name of const arrays
- *
- * Revision 1.4 2004/03/12 19:59:31 dicuccio
- * Dropped static on private arrays as WorkShop doesn't let templates use such
- * arrays
- *
- * Revision 1.3 2004/03/11 17:27:16 dicuccio
- * Changed static member arrays to private static arrays
- *
- * Revision 1.2 2003/09/10 17:55:04 ucko
- * Add a cast to fix 64-bit compilation.
- *
- * Revision 1.1 2003/09/10 15:31:34 jcherry
- * Initial version
- *
- * ===========================================================================
- */