aln_scoring.cpp
上传用户:yhdzpy8989
上传日期:2007-06-13
资源大小:13604k
文件大小:10k
- /*
- * ===========================================================================
- * PRODUCTION $Log: aln_scoring.cpp,v $
- * PRODUCTION Revision 1000.3 2004/06/01 21:07:04 gouriano
- * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.11
- * PRODUCTION
- * ===========================================================================
- */
- /* $Id: aln_scoring.cpp,v 1000.3 2004/06/01 21:07:04 gouriano Exp $
- * ===========================================================================
- *
- * PUBLIC DOMAIN NOTICE
- * National Center for Biotechnology Information
- *
- * This software/database is a "United States Government Work" under the
- * terms of the United States Copyright Act. It was written as part of
- * the author's official duties as a United States Government employee and
- * thus cannot be copyrighted. This software/database is freely available
- * to the public for use. The National Library of Medicine and the U.S.
- * Government have not placed any restriction on its use or reproduction.
- *
- * Although all reasonable efforts have been taken to ensure the accuracy
- * and reliability of the software and data, the NLM and the U.S.
- * Government do not and cannot warrant the performance or results that
- * may be obtained by using this software or data. The NLM and the U.S.
- * Government disclaim all warranties, express or implied, including
- * warranties of performance, merchantability or fitness for any particular
- * purpose.
- *
- * Please cite the author in any work or product based on this material.
- *
- * ===========================================================================
- *
- * Authors: Andrey Yazhuk
- */
- #include <ncbi_pch.hpp>
- #include <gui/widgets/aln_multiple/alnmulti_settings.hpp>
- #include <gui/widgets/aln_multiple/aln_scoring.hpp>
- #include <math.h>
- #include <stdio.h>
- BEGIN_NCBI_SCOPE
- USING_SCOPE(objects);
- CSimpleScoringMethod::CSimpleScoringMethod()
- : m_Space(0), m_Gap(0)
- {
- m_vCharCounts.resize(256);
- SetOptions(fIgnoreEmptySpace /*| fIgnoreGaps*/);
- }
- CSimpleScoringMethod::~CSimpleScoringMethod()
- {
- }
- void CSimpleScoringMethod::SetOptions(int options)
- {
- m_Options = options;
- m_Space = (m_Options & fIgnoreEmptySpace) ? ' ' : 0;
- m_Gap = (m_Options & fIgnoreGaps) ? '-' : 0;
- }
- void CSimpleScoringMethod::CreateColorTable(int size)
- {
- m_vColors.resize(size);
- float K = 1.0f / (size - 1);
- // unperfect agreement - gradient red
- for( int i = 0; i < size - 1; i++ ) {
- float score = i * K;
- float gray = score * 0.8f;
- m_vColors[i] = CGlColor(1.0f, gray, gray);
- }
- m_vColors[size - 1] = CGlColor(0.9f, 0.9f, 0.9f); // perefect agreement - light gray
- }
- string CSimpleScoringMethod::GetName()
- {
- return "Strict column agreement";
- }
- void CSimpleScoringMethod::CalculateScores(char cons, const string& column,
- TScore& col_score, TScoreVector& scores)
- {
- _ASSERT(scores.size() == column.size() && m_vCharCounts.size() == 256);
-
- // reset histogram
- size_t space_n = 0;
- fill(m_vCharCounts.begin(), m_vCharCounts.end(), 0);
-
- // calculate histogram
- ITERATE(string, it, column) {
- char c = *it;
- if(c != m_Space && c != m_Gap) {
- ++m_vCharCounts[(size_t) c];
- } else {
- space_n++;
- }
- }
-
- // calculate column score
- /*int max_count = 0, sum = 0;
- for( int i = 0; i < 256; i++ ) { // ### performance
- int n = m_vCharCounts[i];
- if(n) {
- sum += n * n;
- max_count = max(n, max_count);
- }
- } */
- size_t total = column.size() - space_n;
- //col_score = sqrt((TScore)sum) / total;
- col_score = 0.0; //###
-
- // calculate individual scores
- for( size_t i = 0; i < column.size(); i++ ) {
- char c = column[i];
- if(c != m_Space && c != m_Gap) {
- scores[i] = ((TScore) m_vCharCounts[(size_t) c]) / total;
- } else {
- scores[i] = 1.0;
- }
- }
- }
- string CSNPScoringMethod::GetName()
- {
- return "SNP Highlighting";
- }
- void CSNPScoringMethod::CalculateScores(char cons, const string& column,
- TScore& col_score, TScoreVector& scores)
- {
- _ASSERT(scores.size() == column.size());
-
- for( size_t i = 0; i < column.size(); i++ ) {
- char c = column[i];
- if(c != m_Space && c != m_Gap) {
- scores[i] = (c == cons) ? 1.0 : 0.0;
- } else {
- scores[i] = 1.0;
- }
- }
- }
- const CGlColor& CSimpleScoringMethod::GetColorForScore(TScore score) const
- {
- _ASSERT(m_vColors.size());
- size_t ind = (size_t) floor(score * m_vColors.size());
- if(ind == m_vColors.size())
- --ind;
- return m_vColors[ind];
- }
- CScoreCache::CScoreCache()
- : m_pAlnVec(NULL),
- m_pMethod(NULL),
- m_GradNumber(16)
- {
- }
-
- void CScoreCache::SetGradNumber(int grad_n)
- {
- _ASSERT(grad_n > 1 && grad_n <= 0xFFFF);
- m_GradNumber = grad_n;
- }
- void CScoreCache::SetScoringMethod(IScoringMethod *method)
- {
- m_pMethod = method;
- }
- IScoringMethod* CScoreCache::GetScoringMethod()
- {
- return m_pMethod;
- }
- const IScoringMethod* CScoreCache::GetScoringMethod() const
- {
- return m_pMethod;
- }
- void CScoreCache::SetAlnVec(const CAlnVec* aln_vec)
- {
- m_pAlnVec = aln_vec;
- }
- /// Calculates scores for the given CAlnVec object and saves results in form of
- /// TScoreColl objects.
- void CScoreCache::CalculateScores()
- {
- _ASSERT(m_pAlnVec);
- _ASSERT(m_pMethod);
- CStopWatch sw;
- sw.Start();
- TSeqPos start = m_pAlnVec->GetAlnStart();
- TSeqPos stop = m_pAlnVec->GetAlnStop();
- TNumrow row_n = m_pAlnVec->GetNumRows();
- // preparing score collections
- m_vScoreColls.resize(row_n);
- NON_CONST_ITERATE(TScoreCollVector, itC, m_vScoreColls) {
- itC->SetFrom(start); // clear and initialize
- }
- string column(row_n, ' ');
- TScore col_score = 0;
- TScoreVector v_col_scores(row_n, 0.0f);
- const TSeqPos kPageSize = 256;
- x_AllocBuffer(kPageSize);
-
- TScore grad_n = m_GradNumber;
- TNumrow cons_row = m_pAlnVec->GetAnchor();
- // iterate from "start" to "stop" using "sliding buffer"
- for( TSeqPos pos = start; pos < stop; ) {
-
- TSeqPos pos_stop = min(pos + kPageSize -1, stop);
- x_UpdateBuffer(pos, pos_stop); // fetch next page in Seq Buffer
- for( TSeqPos p = pos; p <= pos_stop ; p++ ) { // for each column
- x_BufferGetColumn(p, column);
- char cons = (cons_row > -1) ? column[cons_row] : 0;
- m_pMethod->CalculateScores(cons, column, col_score, v_col_scores);
-
- // append scores to collections
- for(TNumrow r = 0; r < row_n; r++ ) {
- TScore sc = v_col_scores[r];
- sc = ((int) (sc * grad_n)) / grad_n;
- m_vScoreColls[r].push_back(sc);
- }
- }
- pos = pos_stop + 1;
- }
- int total_int = 0;
- NON_CONST_ITERATE(TScoreCollVector, itC, m_vScoreColls) {
- total_int += itC->size();
- }
- char s[128];
- sprintf(s, "CScoreCache::CalculateScores() - total rows - %d intervals %d", row_n, total_int);
- LOG_POST(s);
-
- CAlnMultiUtils::ReportElapced("CScoreCache::CalculateScores()", sw);
- }
- const CScoreCache::TScoreColl& CScoreCache::GetScores(TNumrow row) const
- {
- _ASSERT(row >= 0 && row < (TNumrow) m_vScoreColls.size());
- return m_vScoreColls[row];
- }
- ///////////////////////////////////////////////////////////////////////////////
- /// Sequence buffer management routines
- inline char CScoreCache::x_BufferGetSeq(TSeqPos pos, TNumrow row) const
- {
- _ASSERT(pos >= m_BufferStart && pos < m_BufferStart + m_RowLength);
- _ASSERT(row >= 0 && row < (TNumrow) m_vRows.size());
- return m_vRows[row][pos - m_BufferStart];
- }
- void CScoreCache::x_AllocBuffer(TSeqPos row_len)
- {
- _ASSERT(m_pAlnVec);
- int rows_n = m_pAlnVec->GetNumRows();
- if(rows_n != (TNumrow) m_vRows.size() || m_RowLength != row_len) {
- m_RowLength = row_len;
-
- m_vRows.resize(rows_n);
- NON_CONST_ITERATE(vector<string>, itR, m_vRows) {
- itR->resize(m_RowLength);
- }
- }
- }
- void CScoreCache::x_FreeBuffer()
- {
- m_vRows.clear();
- }
- void CScoreCache::x_UpdateBuffer(TSeqPos start, TSeqPos stop)
- {
- _ASSERT(m_pAlnVec);
- _ASSERT( (stop - start + 1) <= m_RowLength);
-
- m_BufferStart = start;
- CAlnVec::TSignedRange range(start, stop);
- TNumrow row_n = (TNumrow) m_vRows.size();
- for( TNumrow r = 0; r < row_n; r++ ) {
- m_pAlnVec->GetAlnSeqString(m_vRows[r], r, range);
- }
- }
- void CScoreCache::x_BufferGetColumn(TSeqPos pos, string& column) const
- {
- _ASSERT(pos >= m_BufferStart && pos < m_BufferStart + m_RowLength);
-
- size_t col = pos - m_BufferStart;
- for(size_t row = 0; row < m_vRows.size(); row++ ) {
- column[row] = m_vRows[row][col];
- }
- }
- END_NCBI_SCOPE
- /*
- * ===========================================================================
- * $Log: aln_scoring.cpp,v $
- * Revision 1000.3 2004/06/01 21:07:04 gouriano
- * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.11
- *
- * Revision 1.11 2004/05/21 22:27:52 gorelenk
- * Added PCH ncbi_pch.hpp
- *
- * Revision 1.10 2004/04/02 16:38:11 yazhuk
- * Added to CSimpleScoringMethod options for ignoring empty space and gaps;
- * Added CSNPScoringMethod .
- *
- * Revision 1.9 2004/03/18 17:09:03 yazhuk
- * Added GetScoringMethod()
- *
- * Revision 1.8 2004/02/18 02:16:21 ucko
- * Tweak to avoid trying to invoke sqrt on an int.
- *
- * Revision 1.7 2004/02/17 15:20:51 yazhuk
- * Optimized scores calculation
- *
- * Revision 1.6 2004/02/11 17:43:09 yazhuk
- * Implemented GetName(); added comments
- *
- * Revision 1.5 2004/02/11 15:27:42 yazhuk
- * Changed color table generation
- *
- * Revision 1.4 2003/11/14 15:45:48 ucko
- * Likewise fix initialization of v_col_scores in
- * CScoreCache::CalculateScores for Compaq's compiler.
- * Qualify method names in previous log messages.
- *
- * Revision 1.3 2003/11/14 13:10:14 ucko
- * Tweak constructor of vCounts in CSimpleScoringMethod::CalculateScores
- * for Compaq's compiler.
- *
- * Revision 1.2 2003/10/11 18:20:34 ucko
- * Fixes for GCC 2.95: #include <stdio.h> for sprintf(); tweak constr. of
- * column in CScoreCache::CalculateScores to avoid triggering an inappropriate
- * template.
- *
- * Revision 1.1 2003/10/10 19:06:25 yazhuk
- * Initial revision
- *
- * ===========================================================================
- */