blast_stat.c
上传用户:yhdzpy8989
上传日期:2007-06-13
资源大小:13604k
文件大小:113k
- /*
- * ===========================================================================
- * PRODUCTION $Log: blast_stat.c,v $
- * PRODUCTION Revision 1000.6 2004/06/01 18:07:50 gouriano
- * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.77
- * PRODUCTION
- * ===========================================================================
- */
- /* $Id: blast_stat.c,v 1000.6 2004/06/01 18:07:50 gouriano Exp $
- * ===========================================================================
- *
- * PUBLIC DOMAIN NOTICE
- * National Center for Biotechnology Information
- *
- * This software/database is a "United States Government Work" under the
- * terms of the United States Copyright Act. It was written as part of
- * the author's official duties as a United States Government employee and
- * thus cannot be copyrighted. This software/database is freely available
- * to the public for use. The National Library of Medicine and the U.S.
- * Government have not placed any restriction on its use or reproduction.
- *
- * Although all reasonable efforts have been taken to ensure the accuracy
- * and reliability of the software and data, the NLM and the U.S.
- * Government do not and cannot warrant the performance or results that
- * may be obtained by using this software or data. The NLM and the U.S.
- * Government disclaim all warranties, express or implied, including
- * warranties of performance, merchantability or fitness for any particular
- * purpose.
- *
- * Please cite the author in any work or product based on this material.
- *
- * ===========================================================================
- *
- * Author: Tom Madden
- *
- */
- /** @file blast_stat.c
- * Functions to calculate BLAST probabilities etc.
- * Detailed Contents:
- *
- * - allocate and deallocate structures used by BLAST to calculate
- * probabilities etc.
- *
- * - calculate residue frequencies for query and "average" database.
- *
- * - read in matrix or load it from memory.
- *
- * - calculate sum-p from a collection of HSP's, for both the case
- * of a "small" gap and a "large" gap, when give a total score and the
- * number of HSP's.
- *
- * - calculate expect values for p-values.
- *
- * - calculate pseuod-scores from p-values.
- *
- * @todo FIXME needs doxygen comments
- */
- static char const rcsid[] =
- "$Id: blast_stat.c,v 1000.6 2004/06/01 18:07:50 gouriano Exp $";
- #include <algo/blast/core/blast_stat.h>
- #include <algo/blast/core/blast_util.h>
- #include <util/tables/raw_scoremat.h>
- #include <algo/blast/core/blast_encoding.h>
- #include "blast_psi_priv.h"
- /* OSF1 apparently doesn't like this. */
- #if defined(HUGE_VAL) && !defined(OS_UNIX_OSF1)
- #define BLASTKAR_HUGE_VAL HUGE_VAL
- #else
- #define BLASTKAR_HUGE_VAL 1.e30
- #endif
- /* Allocates and Deallocates the two-dimensional matrix. */
- static SBLASTMatrixStructure* BlastMatrixAllocate (Int2 alphabet_size);
- /* performs sump calculation, used by BlastSumPStd */
- static double BlastSumPCalc (int r, double s);
- #define COMMENT_CHR '#'
- #define TOKSTR " tnr"
- #define BLAST_MAX_ALPHABET 40 /* ncbistdaa is only 26, this should be enough */
- /*
- How many of the first bases are not ambiguous
- (it's four, of course).
- */
- #define NUMBER_NON_AMBIG_BP 4
- /* Used in BlastKarlinBlkGappedCalc */
- typedef double array_of_8[8];
- /* Used to temporarily store matrix values for retrieval. */
- typedef struct MatrixInfo {
- char* name; /* name of matrix (e.g., BLOSUM90). */
- array_of_8 *values; /* The values (below). */
- Int4 *prefs; /* Preferences for display. */
- Int4 max_number_values; /* number of values (e.g., BLOSUM90_VALUES_MAX). */
- } MatrixInfo;
- /**************************************************************************************
- How the statistical parameters for the matrices are stored:
- -----------------------------------------------------------
- They parameters are stored in a two-dimensional array double (i.e.,
- doubles, which has as it's first dimensions the number of different
- gap existence and extension combinations and as it's second dimension 8.
- The eight different columns specify:
- 1.) gap existence penalty (INT2_MAX denotes infinite).
- 2.) gap extension penalty (INT2_MAX denotes infinite).
- 3.) decline to align penalty (INT2_MAX denotes infinite).
- 4.) Lambda
- 5.) K
- 6.) H
- 7.) alpha
- 8.) beta
- (Items 4-8 are explained in:
- Altschul SF, Bundschuh R, Olsen R, Hwa T.
- The estimation of statistical parameters for local alignment score distributions.
- Nucleic Acids Res. 2001 Jan 15;29(2):351-61.).
- Take BLOSUM45 (below) as an example. Currently (5/17/02) there are
- 14 different allowed combinations (specified by "#define BLOSUM45_VALUES_MAX 14").
- The first row in the array "blosum45_values" has INT2_MAX (i.e., 32767) for gap
- existence, extension, and decline-to-align penalties. For all practical purposes
- this value is large enough to be infinite, so the alignments will be ungapped.
- BLAST may also use this value (INT2_MAX) as a signal to skip gapping, so a
- different value should not be used if the intent is to have gapless extensions.
- The next row is for the gap existence penalty 13 and the extension penalty 3.
- The decline-to-align penalty is only supported in a few cases, so it is normally
- set to INT2_MAX.
- How to add a new matrix to blast_stat.c:
- --------------------------------------
- To add a new matrix to blast_stat.c it is necessary to complete
- four steps. As an example consider adding the matrix
- called TESTMATRIX
- 1.) add a define specifying how many different existence and extensions
- penalties are allowed, so it would be necessary to add the line:
- #define TESTMATRIX_VALUES_MAX 14
- if 14 values were to be allowed.
- 2.) add a two-dimensional array to contain the statistical parameters:
- static double testmatrix_values[TESTMATRIX_VALUES_MAX][8] ={ ...
- 3.) add a "prefs" array that should hint about the "optimal"
- gap existence and extension penalties:
- static Int4 testmatrix_prefs[TESTMATRIX_VALUES_MAX] = {
- BLAST_MATRIX_NOMINAL,
- ...
- };
- 4.) Go to the function BlastLoadMatrixValues (in this file) and
- add two lines before the return at the end of the function:
- matrix_info = MatrixInfoNew("TESTMATRIX", testmatrix_values, testmatrix_prefs, TESTMATRIX_VALUES_MAX);
- ListNodeAddPointer(&retval, 0, matrix_info);
- ***************************************************************************************/
-
-
- #define BLOSUM45_VALUES_MAX 14
- static double blosum45_values[BLOSUM45_VALUES_MAX][8] = {
- {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.2291, 0.0924, 0.2514, 0.9113, -5.7},
- {13, 3, (double) INT2_MAX, 0.207, 0.049, 0.14, 1.5, -22},
- {12, 3, (double) INT2_MAX, 0.199, 0.039, 0.11, 1.8, -34},
- {11, 3, (double) INT2_MAX, 0.190, 0.031, 0.095, 2.0, -38},
- {10, 3, (double) INT2_MAX, 0.179, 0.023, 0.075, 2.4, -51},
- {16, 2, (double) INT2_MAX, 0.210, 0.051, 0.14, 1.5, -24},
- {15, 2, (double) INT2_MAX, 0.203, 0.041, 0.12, 1.7, -31},
- {14, 2, (double) INT2_MAX, 0.195, 0.032, 0.10, 1.9, -36},
- {13, 2, (double) INT2_MAX, 0.185, 0.024, 0.084, 2.2, -45},
- {12, 2, (double) INT2_MAX, 0.171, 0.016, 0.061, 2.8, -65},
- {19, 1, (double) INT2_MAX, 0.205, 0.040, 0.11, 1.9, -43},
- {18, 1, (double) INT2_MAX, 0.198, 0.032, 0.10, 2.0, -43},
- {17, 1, (double) INT2_MAX, 0.189, 0.024, 0.079, 2.4, -57},
- {16, 1, (double) INT2_MAX, 0.176, 0.016, 0.063, 2.8, -67},
- };
- static Int4 blosum45_prefs[BLOSUM45_VALUES_MAX] = {
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_BEST,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL
- };
- #define BLOSUM50_VALUES_MAX 16
- static double blosum50_values[BLOSUM50_VALUES_MAX][8] = {
- {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.2318, 0.112, 0.3362, 0.6895, -4.0},
- {13, 3, (double) INT2_MAX, 0.212, 0.063, 0.19, 1.1, -16},
- {12, 3, (double) INT2_MAX, 0.206, 0.055, 0.17, 1.2, -18},
- {11, 3, (double) INT2_MAX, 0.197, 0.042, 0.14, 1.4, -25},
- {10, 3, (double) INT2_MAX, 0.186, 0.031, 0.11, 1.7, -34},
- {9, 3, (double) INT2_MAX, 0.172, 0.022, 0.082, 2.1, -48},
- {16, 2, (double) INT2_MAX, 0.215, 0.066, 0.20, 1.05, -15},
- {15, 2, (double) INT2_MAX, 0.210, 0.058, 0.17, 1.2, -20},
- {14, 2, (double) INT2_MAX, 0.202, 0.045, 0.14, 1.4, -27},
- {13, 2, (double) INT2_MAX, 0.193, 0.035, 0.12, 1.6, -32},
- {12, 2, (double) INT2_MAX, 0.181, 0.025, 0.095, 1.9, -41},
- {19, 1, (double) INT2_MAX, 0.212, 0.057, 0.18, 1.2, -21},
- {18, 1, (double) INT2_MAX, 0.207, 0.050, 0.15, 1.4, -28},
- {17, 1, (double) INT2_MAX, 0.198, 0.037, 0.12, 1.6, -33},
- {16, 1, (double) INT2_MAX, 0.186, 0.025, 0.10, 1.9, -42},
- {15, 1, (double) INT2_MAX, 0.171, 0.015, 0.063, 2.7, -76},
- };
- static Int4 blosum50_prefs[BLOSUM50_VALUES_MAX] = {
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_BEST,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL
- };
- #define BLOSUM62_VALUES_MAX 12
- static double blosum62_values[BLOSUM62_VALUES_MAX][8] = {
- {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.3176, 0.134, 0.4012, 0.7916, -3.2},
- {11, 2, (double) INT2_MAX, 0.297, 0.082, 0.27, 1.1, -10},
- {10, 2, (double) INT2_MAX, 0.291, 0.075, 0.23, 1.3, -15},
- {9, 2, (double) INT2_MAX, 0.279, 0.058, 0.19, 1.5, -19},
- {8, 2, (double) INT2_MAX, 0.264, 0.045, 0.15, 1.8, -26},
- {7, 2, (double) INT2_MAX, 0.239, 0.027, 0.10, 2.5, -46},
- {6, 2, (double) INT2_MAX, 0.201, 0.012, 0.061, 3.3, -58},
- {13, 1, (double) INT2_MAX, 0.292, 0.071, 0.23, 1.2, -11},
- {12, 1, (double) INT2_MAX, 0.283, 0.059, 0.19, 1.5, -19},
- {11, 1, (double) INT2_MAX, 0.267, 0.041, 0.14, 1.9, -30},
- {10, 1, (double) INT2_MAX, 0.243, 0.024, 0.10, 2.5, -44},
- {9, 1, (double) INT2_MAX, 0.206, 0.010, 0.052, 4.0, -87},
- };
- static Int4 blosum62_prefs[BLOSUM62_VALUES_MAX] = {
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_BEST,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- };
- #define BLOSUM80_VALUES_MAX 10
- static double blosum80_values[BLOSUM80_VALUES_MAX][8] = {
- {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.3430, 0.177, 0.6568, 0.5222, -1.6},
- {25, 2, (double) INT2_MAX, 0.342, 0.17, 0.66, 0.52, -1.6},
- {13, 2, (double) INT2_MAX, 0.336, 0.15, 0.57, 0.59, -3},
- {9, 2, (double) INT2_MAX, 0.319, 0.11, 0.42, 0.76, -6},
- {8, 2, (double) INT2_MAX, 0.308, 0.090, 0.35, 0.89, -9},
- {7, 2, (double) INT2_MAX, 0.293, 0.070, 0.27, 1.1, -14},
- {6, 2, (double) INT2_MAX, 0.268, 0.045, 0.19, 1.4, -19},
- {11, 1, (double) INT2_MAX, 0.314, 0.095, 0.35, 0.90, -9},
- {10, 1, (double) INT2_MAX, 0.299, 0.071, 0.27, 1.1, -14},
- {9, 1, (double) INT2_MAX, 0.279, 0.048, 0.20, 1.4, -19},
- };
- static Int4 blosum80_prefs[BLOSUM80_VALUES_MAX] = {
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_BEST,
- BLAST_MATRIX_NOMINAL
- };
- #define BLOSUM90_VALUES_MAX 8
- static double blosum90_values[BLOSUM90_VALUES_MAX][8] = {
- {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.3346, 0.190, 0.7547, 0.4434, -1.4},
- {9, 2, (double) INT2_MAX, 0.310, 0.12, 0.46, 0.67, -6},
- {8, 2, (double) INT2_MAX, 0.300, 0.099, 0.39, 0.76, -7},
- {7, 2, (double) INT2_MAX, 0.283, 0.072, 0.30, 0.93, -11},
- {6, 2, (double) INT2_MAX, 0.259, 0.048, 0.22, 1.2, -16},
- {11, 1, (double) INT2_MAX, 0.302, 0.093, 0.39, 0.78, -8},
- {10, 1, (double) INT2_MAX, 0.290, 0.075, 0.28, 1.04, -15},
- {9, 1, (double) INT2_MAX, 0.265, 0.044, 0.20, 1.3, -19},
- };
- static Int4 blosum90_prefs[BLOSUM90_VALUES_MAX] = {
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_BEST,
- BLAST_MATRIX_NOMINAL
- };
- #define PAM250_VALUES_MAX 16
- static double pam250_values[PAM250_VALUES_MAX][8] = {
- {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.2252, 0.0868, 0.2223, 0.98, -5.0},
- {15, 3, (double) INT2_MAX, 0.205, 0.049, 0.13, 1.6, -23},
- {14, 3, (double) INT2_MAX, 0.200, 0.043, 0.12, 1.7, -26},
- {13, 3, (double) INT2_MAX, 0.194, 0.036, 0.10, 1.9, -31},
- {12, 3, (double) INT2_MAX, 0.186, 0.029, 0.085, 2.2, -41},
- {11, 3, (double) INT2_MAX, 0.174, 0.020, 0.070, 2.5, -48},
- {17, 2, (double) INT2_MAX, 0.204, 0.047, 0.12, 1.7, -28},
- {16, 2, (double) INT2_MAX, 0.198, 0.038, 0.11, 1.8, -29},
- {15, 2, (double) INT2_MAX, 0.191, 0.031, 0.087, 2.2, -44},
- {14, 2, (double) INT2_MAX, 0.182, 0.024, 0.073, 2.5, -53},
- {13, 2, (double) INT2_MAX, 0.171, 0.017, 0.059, 2.9, -64},
- {21, 1, (double) INT2_MAX, 0.205, 0.045, 0.11, 1.8, -34},
- {20, 1, (double) INT2_MAX, 0.199, 0.037, 0.10, 1.9, -35},
- {19, 1, (double) INT2_MAX, 0.192, 0.029, 0.083, 2.3, -52},
- {18, 1, (double) INT2_MAX, 0.183, 0.021, 0.070, 2.6, -60},
- {17, 1, (double) INT2_MAX, 0.171, 0.014, 0.052, 3.3, -86},
- };
- static Int4 pam250_prefs[PAM250_VALUES_MAX] = {
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_BEST,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL
- };
- #define PAM30_VALUES_MAX 7
- static double pam30_values[PAM30_VALUES_MAX][8] = {
- {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.3400, 0.283, 1.754, 0.1938, -0.3},
- {7, 2, (double) INT2_MAX, 0.305, 0.15, 0.87, 0.35, -3},
- {6, 2, (double) INT2_MAX, 0.287, 0.11, 0.68, 0.42, -4},
- {5, 2, (double) INT2_MAX, 0.264, 0.079, 0.45, 0.59, -7},
- {10, 1, (double) INT2_MAX, 0.309, 0.15, 0.88, 0.35, -3},
- {9, 1, (double) INT2_MAX, 0.294, 0.11, 0.61, 0.48, -6},
- {8, 1, (double) INT2_MAX, 0.270, 0.072, 0.40, 0.68, -10},
- };
- static Int4 pam30_prefs[PAM30_VALUES_MAX] = {
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_BEST,
- BLAST_MATRIX_NOMINAL,
- };
- #define PAM70_VALUES_MAX 7
- static double pam70_values[PAM70_VALUES_MAX][8] = {
- {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.3345, 0.229, 1.029, 0.3250, -0.7},
- {8, 2, (double) INT2_MAX, 0.301, 0.12, 0.54, 0.56, -5},
- {7, 2, (double) INT2_MAX, 0.286, 0.093, 0.43, 0.67, -7},
- {6, 2, (double) INT2_MAX, 0.264, 0.064, 0.29, 0.90, -12},
- {11, 1, (double) INT2_MAX, 0.305, 0.12, 0.52, 0.59, -6},
- {10, 1, (double) INT2_MAX, 0.291, 0.091, 0.41, 0.71, -9},
- {9, 1, (double) INT2_MAX, 0.270, 0.060, 0.28, 0.97, -14},
- };
- static Int4 pam70_prefs[PAM70_VALUES_MAX] = {
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_BEST,
- BLAST_MATRIX_NOMINAL
- };
- #define BLOSUM62_20_VALUES_MAX 65
- static double blosum62_20_values[BLOSUM62_20_VALUES_MAX][8] = {
- {(double) INT2_MAX, (double) INT2_MAX, (double) INT2_MAX, 0.03391, 0.125, 0.4544, 0.07462, -3.2},
- {100, 12, (double) INT2_MAX, 0.0300, 0.056, 0.21, 0.14, -15},
- {95, 12, (double) INT2_MAX, 0.0291, 0.047, 0.18, 0.16, -20},
- {90, 12, (double) INT2_MAX, 0.0280, 0.038, 0.15, 0.19, -28},
- {85, 12, (double) INT2_MAX, 0.0267, 0.030, 0.13, 0.21, -31},
- {80, 12, (double) INT2_MAX, 0.0250, 0.021, 0.10, 0.25, -39},
- {105, 11, (double) INT2_MAX, 0.0301, 0.056, 0.22, 0.14, -16},
- {100, 11, (double) INT2_MAX, 0.0294, 0.049, 0.20, 0.15, -17},
- {95, 11, (double) INT2_MAX, 0.0285, 0.042, 0.16, 0.18, -25},
- {90, 11, (double) INT2_MAX, 0.0271, 0.031, 0.14, 0.20, -28},
- {85, 11, (double) INT2_MAX, 0.0256, 0.023, 0.10, 0.26, -46},
- {115, 10, (double) INT2_MAX, 0.0308, 0.062, 0.22, 0.14, -20},
- {110, 10, (double) INT2_MAX, 0.0302, 0.056, 0.19, 0.16, -26},
- {105, 10, (double) INT2_MAX, 0.0296, 0.050, 0.17, 0.17, -27},
- {100, 10, (double) INT2_MAX, 0.0286, 0.041, 0.15, 0.19, -32},
- {95, 10, (double) INT2_MAX, 0.0272, 0.030, 0.13, 0.21, -35},
- {90, 10, (double) INT2_MAX, 0.0257, 0.022, 0.11, 0.24, -40},
- {85, 10, (double) INT2_MAX, 0.0242, 0.017, 0.083, 0.29, -51},
- {115, 9, (double) INT2_MAX, 0.0306, 0.061, 0.24, 0.13, -14},
- {110, 9, (double) INT2_MAX, 0.0299, 0.053, 0.19, 0.16, -23},
- {105, 9, (double) INT2_MAX, 0.0289, 0.043, 0.17, 0.17, -23},
- {100, 9, (double) INT2_MAX, 0.0279, 0.036, 0.14, 0.20, -31},
- {95, 9, (double) INT2_MAX, 0.0266, 0.028, 0.12, 0.23, -37},
- {120, 8, (double) INT2_MAX, 0.0307, 0.062, 0.22, 0.14, -18},
- {115, 8, (double) INT2_MAX, 0.0300, 0.053, 0.20, 0.15, -19},
- {110, 8, (double) INT2_MAX, 0.0292, 0.046, 0.17, 0.17, -23},
- {105, 8, (double) INT2_MAX, 0.0280, 0.035, 0.14, 0.20, -31},
- {100, 8, (double) INT2_MAX, 0.0266, 0.026, 0.12, 0.23, -37},
- {125, 7, (double) INT2_MAX, 0.0306, 0.058, 0.22, 0.14, -18},
- {120, 7, (double) INT2_MAX, 0.0300, 0.052, 0.19, 0.16, -23},
- {115, 7, (double) INT2_MAX, 0.0292, 0.044, 0.17, 0.17, -24},
- {110, 7, (double) INT2_MAX, 0.0279, 0.032, 0.14, 0.20, -31},
- {105, 7, (double) INT2_MAX, 0.0267, 0.026, 0.11, 0.24, -41},
- {120,10,5, 0.0298, 0.049, 0.19, 0.16, -21},
- {115,10,5, 0.0290, 0.042, 0.16, 0.18, -25},
- {110,10,5, 0.0279, 0.033, 0.13, 0.21, -32},
- {105,10,5, 0.0264, 0.024, 0.10, 0.26, -46},
- {100,10,5, 0.0250, 0.018, 0.081, 0.31, -56},
- {125,10,4, 0.0301, 0.053, 0.18, 0.17, -25},
- {120,10,4, 0.0292, 0.043, 0.15, 0.20, -33},
- {115,10,4, 0.0282, 0.035, 0.13, 0.22, -36},
- {110,10,4, 0.0270, 0.027, 0.11, 0.25, -41},
- {105,10,4, 0.0254, 0.020, 0.079, 0.32, -60},
- {130,10,3, 0.0300, 0.051, 0.17, 0.18, -27},
- {125,10,3, 0.0290, 0.040, 0.13, 0.22, -38},
- {120,10,3, 0.0278, 0.030, 0.11, 0.25, -44},
- {115,10,3, 0.0267, 0.025, 0.092, 0.29, -52},
- {110,10,3, 0.0252, 0.018, 0.070, 0.36, -70},
- {135,10,2, 0.0292, 0.040, 0.13, 0.22, -35},
- {130,10,2, 0.0283, 0.034, 0.10, 0.28, -51},
- {125,10,2, 0.0269, 0.024, 0.077, 0.35, -71},
- {120,10,2, 0.0253, 0.017, 0.059, 0.43, -90},
- {115,10,2, 0.0234, 0.011, 0.043, 0.55, -121},
- {100,14,3, 0.0258, 0.023, 0.087, 0.33, -59},
- {105,13,3, 0.0263, 0.024, 0.085, 0.31, -57},
- {110,12,3, 0.0271, 0.028, 0.093, 0.29, -54},
- {115,11,3, 0.0275, 0.030, 0.10, 0.27, -49},
- {125,9,3, 0.0283, 0.034, 0.12, 0.23, -38},
- {130,8,3, 0.0287, 0.037, 0.12, 0.23, -40},
- {125,7,3, 0.0287, 0.036, 0.12, 0.24, -44},
- {140,6,3, 0.0285, 0.033, 0.12, 0.23, -40},
- {105,14,3, 0.0270, 0.028, 0.10, 0.27, -46},
- {110,13,3, 0.0279, 0.034, 0.10, 0.27, -50},
- {115,12,3, 0.0282, 0.035, 0.12, 0.24, -42},
- {120,11,3, 0.0286, 0.037, 0.12, 0.24, -44},
- };
- static Int4 blosum62_20_prefs[BLOSUM62_20_VALUES_MAX] = {
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_BEST,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL,
- BLAST_MATRIX_NOMINAL
- };
- /*
- Allocates memory for the BlastScoreBlk*.
- */
- BlastScoreBlk*
- BlastScoreBlkNew(Uint1 alphabet, Int4 number_of_contexts)
- {
- BlastScoreBlk* sbp;
- sbp = (BlastScoreBlk*) calloc(1, sizeof(BlastScoreBlk));
- if (sbp != NULL)
- {
- sbp->alphabet_code = alphabet;
- if (alphabet != BLASTNA_SEQ_CODE)
- sbp->alphabet_size = BLASTAA_SIZE;
- else
- sbp->alphabet_size = BLASTNA_SIZE;
- /* set the alphabet type (protein or not). */
- switch (alphabet) {
- case BLASTAA_SEQ_CODE:
- sbp->protein_alphabet = TRUE;
- break;
- case BLASTNA_SEQ_CODE:
- sbp->protein_alphabet = FALSE;
- break;
- default:
- break;
- }
- sbp->matrix_struct = BlastMatrixAllocate(sbp->alphabet_size);
- if (sbp->matrix_struct == NULL)
- {
- sbp = BlastScoreBlkFree(sbp);
- return sbp;
- }
- sbp->matrix = sbp->matrix_struct->matrix;
- sbp->maxscore = (Int4 *) calloc(BLAST_MATRIX_SIZE, sizeof(Int4));
- sbp->scale_factor = 1.0;
- sbp->number_of_contexts = number_of_contexts;
- sbp->sfp = (Blast_ScoreFreq**)
- calloc(sbp->number_of_contexts, sizeof(Blast_ScoreFreq*));
- sbp->kbp_std = (Blast_KarlinBlk**)
- calloc(sbp->number_of_contexts, sizeof(Blast_KarlinBlk*));
- sbp->kbp_gap_std = (Blast_KarlinBlk**)
- calloc(sbp->number_of_contexts, sizeof(Blast_KarlinBlk*));
- sbp->kbp_psi = (Blast_KarlinBlk**)
- calloc(sbp->number_of_contexts, sizeof(Blast_KarlinBlk*));
- sbp->kbp_gap_psi = (Blast_KarlinBlk**)
- calloc(sbp->number_of_contexts, sizeof(Blast_KarlinBlk*));
- }
- return sbp;
- }
- Blast_ScoreFreq*
- Blast_ScoreFreqDestruct(Blast_ScoreFreq* sfp)
- {
- if (sfp == NULL)
- return NULL;
- if (sfp->sprob0 != NULL)
- sfree(sfp->sprob0);
- sfree(sfp);
- return sfp;
- }
- /*
- Deallocates the Karlin Block.
- */
- Blast_KarlinBlk*
- Blast_KarlinBlkDestruct(Blast_KarlinBlk* kbp)
- {
- sfree(kbp);
- return kbp;
- }
- static SBLASTMatrixStructure*
- BlastMatrixDestruct(SBLASTMatrixStructure* matrix_struct)
- {
- if (matrix_struct == NULL)
- return NULL;
- sfree(matrix_struct);
- return matrix_struct;
- }
- BlastScoreBlk*
- BlastScoreBlkFree(BlastScoreBlk* sbp)
- {
- Int4 index, rows;
- if (sbp == NULL)
- return NULL;
-
- for (index=0; index<sbp->number_of_contexts; index++) {
- if (sbp->sfp)
- sbp->sfp[index] = Blast_ScoreFreqDestruct(sbp->sfp[index]);
- if (sbp->kbp_std)
- sbp->kbp_std[index] = Blast_KarlinBlkDestruct(sbp->kbp_std[index]);
- if (sbp->kbp_gap_std)
- sbp->kbp_gap_std[index] = Blast_KarlinBlkDestruct(sbp->kbp_gap_std[index]);
- if (sbp->kbp_psi)
- sbp->kbp_psi[index] = Blast_KarlinBlkDestruct(sbp->kbp_psi[index]);
- if (sbp->kbp_gap_psi)
- sbp->kbp_gap_psi[index] = Blast_KarlinBlkDestruct(sbp->kbp_gap_psi[index]);
- }
- if (sbp->kbp_ideal)
- sbp->kbp_ideal = Blast_KarlinBlkDestruct(sbp->kbp_ideal);
- sfree(sbp->sfp);
- sfree(sbp->kbp_std);
- sfree(sbp->kbp_psi);
- sfree(sbp->kbp_gap_std);
- sfree(sbp->kbp_gap_psi);
- sbp->matrix_struct = BlastMatrixDestruct(sbp->matrix_struct);
- sfree(sbp->maxscore);
- sbp->comments = ListNodeFreeData(sbp->comments);
- sfree(sbp->name);
- sfree(sbp->ambiguous_res);
-
- /* Removing posMatrix and posFreqs if any */
- rows = sbp->query_length + 1;
- if(sbp->posMatrix != NULL) {
- for (index=0; index < rows; index++) {
- sfree(sbp->posMatrix[index]);
- }
- sfree(sbp->posMatrix);
- }
-
- if(sbp->posFreqs != NULL) {
- for (index = 0; index < rows; index++) {
- sfree(sbp->posFreqs[index]);
- }
- sfree(sbp->posFreqs);
- }
-
- sfree(sbp);
-
- return NULL;
- }
- /*
- Set the ambiguous residue (e.g, 'N', 'X') in the BlastScoreBlk*.
- Convert from ncbieaa to sbp->alphabet_code (i.e., ncbistdaa) first.
- */
- Int2
- BLAST_ScoreSetAmbigRes(BlastScoreBlk* sbp, char ambiguous_res)
- {
- Int2 index;
- Uint1* ambig_buffer;
- if (sbp == NULL)
- return 1;
- if (sbp->ambig_occupy >= sbp->ambig_size)
- {
- sbp->ambig_size += 5;
- ambig_buffer = (Uint1 *) calloc(sbp->ambig_size, sizeof(Uint1));
- for (index=0; index<sbp->ambig_occupy; index++)
- {
- ambig_buffer[index] = sbp->ambiguous_res[index];
- }
- sfree(sbp->ambiguous_res);
- sbp->ambiguous_res = ambig_buffer;
- }
- if (sbp->alphabet_code == BLASTAA_SEQ_CODE)
- {
- sbp->ambiguous_res[sbp->ambig_occupy] =
- AMINOACID_TO_NCBISTDAA[toupper(ambiguous_res)];
- }
- else {
- if (sbp->alphabet_code == BLASTNA_SEQ_CODE)
- sbp->ambiguous_res[sbp->ambig_occupy] =
- IUPACNA_TO_BLASTNA[toupper(ambiguous_res)];
- else if (sbp->alphabet_code == NCBI4NA_SEQ_CODE)
- sbp->ambiguous_res[sbp->ambig_occupy] =
- IUPACNA_TO_NCBI4NA[toupper(ambiguous_res)];
- }
- (sbp->ambig_occupy)++;
-
- return 0;
- }
- /*
- Fill in the matrix for blastn using the penaly and rewards
- The query sequence alphabet is blastna, the subject sequence
- is ncbi2na. The alphabet blastna is defined in blastkar.h
- and the first four elements of blastna are identical to ncbi2na.
- The query is in the first index, the subject is the second.
- if matrix==NULL, it is allocated and returned.
- */
- static Int4 **BlastScoreBlkMatCreateEx(Int4 **matrix,Int4 penalty,
- Int4 reward)
- {
- Int2 index1, index2, degen;
- Int2 degeneracy[BLASTNA_SIZE+1];
-
- if(!matrix) {
- SBLASTMatrixStructure* matrix_struct;
- matrix_struct =BlastMatrixAllocate((Int2) BLASTNA_SIZE);
- matrix = matrix_struct->matrix;
- }
- for (index1 = 0; index1<BLASTNA_SIZE; index1++) /* blastna */
- for (index2 = 0; index2<BLASTNA_SIZE; index2++) /* blastna */
- matrix[index1][index2] = 0;
- /* In blastna the 1st four bases are A, C, G, and T, exactly as it is ncbi2na. */
- /* ncbi4na gives them the value 1, 2, 4, and 8. */
- /* Set the first four bases to degen. one */
- for (index1=0; index1<NUMBER_NON_AMBIG_BP; index1++)
- degeneracy[index1] = 1;
- for (index1=NUMBER_NON_AMBIG_BP; index1<BLASTNA_SIZE; index1++) /* blastna */
- {
- degen=0;
- for (index2=0; index2<NUMBER_NON_AMBIG_BP; index2++) /* ncbi2na */
- {
- if (BLASTNA_TO_NCBI4NA[index1] & BLASTNA_TO_NCBI4NA[index2])
- degen++;
- }
- degeneracy[index1] = degen;
- }
- for (index1=0; index1<BLASTNA_SIZE; index1++) /* blastna */
- {
- for (index2=index1; index2<BLASTNA_SIZE; index2++) /* blastna */
- {
- if (BLASTNA_TO_NCBI4NA[index1] & BLASTNA_TO_NCBI4NA[index2])
- { /* round up for positive scores, down for negatives. */
- matrix[index1][index2] = BLAST_Nint( (double) ((degeneracy[index2]-1)*penalty + reward))/degeneracy[index2];
- if (index1 != index2)
- {
- matrix[index2][index1] = matrix[index1][index2];
- }
- }
- else
- {
- matrix[index1][index2] = penalty;
- matrix[index2][index1] = penalty;
- }
- }
- }
- /* The value of 15 is a gap, which is a sentinel between strands in
- the ungapped extension algorithm */
- for (index1=0; index1<BLASTNA_SIZE; index1++) /* blastna */
- matrix[BLASTNA_SIZE-1][index1] = INT4_MIN / 2;
- for (index1=0; index1<BLASTNA_SIZE; index1++) /* blastna */
- matrix[index1][BLASTNA_SIZE-1] = INT4_MIN / 2;
- return matrix;
- }
- /*
- Fill in the matrix for blastn using the penaly and rewards on
- the BlastScoreBlk*.
- The query sequence alphabet is blastna, the subject sequence
- is ncbi2na. The alphabet blastna is defined in blastkar.h
- and the first four elements of blastna are identical to ncbi2na.
- The query is in the first index, the subject is the second.
- */
- static Int2 BlastScoreBlkMatCreate(BlastScoreBlk* sbp)
- {
- sbp->matrix = BlastScoreBlkMatCreateEx(sbp->matrix,sbp->penalty, sbp->reward);
- sbp->mat_dim1 = BLASTNA_SIZE;
- sbp->mat_dim2 = BLASTNA_SIZE;
- return 0;
- }
- /*
- Read in the matrix from the FILE *fp.
- This function ASSUMES that the matrices are in the ncbistdaa
- format. BLAST should be able to use any alphabet, though it
- is expected that ncbistdaa will be used.
- */
- static Int2
- BlastScoreBlkMatRead(BlastScoreBlk* sbp, FILE *fp)
- {
- char buf[512+3];
- char temp[512];
- char* cp,* lp;
- char ch;
- Int4 ** matrix;
- Int4 * m;
- Int4 score;
- Uint4 a1cnt = 0, a2cnt = 0;
- char a1chars[BLAST_MAX_ALPHABET], a2chars[BLAST_MAX_ALPHABET];
- long lineno = 0;
- double xscore;
- register int index1, index2;
- Int2 status;
-
- matrix = sbp->matrix;
-
- if (sbp->alphabet_code != BLASTNA_SEQ_CODE) {
- for (index1 = 0; index1 < sbp->alphabet_size; index1++)
- for (index2 = 0; index2 < sbp->alphabet_size; index2++)
- matrix[index1][index2] = BLAST_SCORE_MIN;
- } else {
- /* Fill-in all the defaults ambiguity and normal codes */
- status=BlastScoreBlkMatCreate(sbp);
- if(status != 0)
- {
- return status;
- }
- }
-
- /* Read the residue names for the second alphabet */
- while (fgets(buf, sizeof(buf), fp) != NULL) {
- ++lineno;
- if (strchr(buf, 'n') == NULL) {
- return 2;
- }
- if (buf[0] == COMMENT_CHR) {
- /* save the comment line in a linked list */
- *strchr(buf, 'n') = NULLB;
- ListNodeCopyStr(&sbp->comments, 0, buf+1);
- continue;
- }
- if ((cp = strchr(buf, COMMENT_CHR)) != NULL)
- *cp = NULLB;
- lp = (char*)strtok(buf, TOKSTR);
- if (lp == NULL) /* skip blank lines */
- continue;
- while (lp != NULL) {
- if (sbp->alphabet_code == BLASTAA_SEQ_CODE)
- ch = AMINOACID_TO_NCBISTDAA[toupper(*lp)];
- else if (sbp->alphabet_code == BLASTNA_SEQ_CODE) {
- ch = IUPACNA_TO_BLASTNA[toupper(*lp)];
- } else {
- ch = *lp;
- }
- a2chars[a2cnt++] = ch;
- lp = (char*)strtok(NULL, TOKSTR);
- }
-
- break; /* Exit loop after reading one line. */
- }
-
- if (a2cnt <= 1) {
- return 2;
- }
- if (sbp->alphabet_code != BLASTNA_SEQ_CODE) {
- sbp->mat_dim2 = a2cnt;
- }
- while (fgets(buf, sizeof(buf), fp) != NULL) {
- ++lineno;
- if ((cp = strchr(buf, 'n')) == NULL) {
- return 2;
- }
- if ((cp = strchr(buf, COMMENT_CHR)) != NULL)
- *cp = NULLB;
- if ((lp = (char*)strtok(buf, TOKSTR)) == NULL)
- continue;
- ch = *lp;
- cp = (char*) lp;
- if ((cp = strtok(NULL, TOKSTR)) == NULL) {
- return 2;
- }
- if (a1cnt >= DIM(a1chars)) {
- return 2;
- }
- if (sbp->alphabet_code == BLASTAA_SEQ_CODE) {
- ch = AMINOACID_TO_NCBISTDAA[toupper(ch)];
- } else {
- if (sbp->alphabet_code == BLASTNA_SEQ_CODE) {
- ch = IUPACNA_TO_BLASTNA[toupper(ch)];
- }
- }
- a1chars[a1cnt++] = ch;
- m = &matrix[(int)ch][0];
- index2 = 0;
- while (cp != NULL) {
- if (index2 >= (int) a2cnt) {
- return 2;
- }
- strcpy(temp, cp);
-
- if (strcasecmp(temp, "na") == 0) {
- score = BLAST_SCORE_1MIN;
- } else {
- if (sscanf(temp, "%lg", &xscore) != 1) {
- return 2;
- }
- /*xscore = MAX(xscore, BLAST_SCORE_1MIN);*/
- if (xscore > BLAST_SCORE_1MAX || xscore < BLAST_SCORE_1MIN) {
- return 2;
- }
- xscore += (xscore >= 0. ? 0.5 : -0.5);
- score = (Int4)xscore;
- }
-
- m[(int)a2chars[index2++]] = score;
-
- cp = strtok(NULL, TOKSTR);
- }
- }
-
- if (a1cnt <= 1) {
- return 2;
- }
-
- if (sbp->alphabet_code != BLASTNA_SEQ_CODE) {
- sbp->mat_dim1 = a1cnt;
- }
-
- return 0;
- }
- static Int2
- BlastScoreBlkMaxScoreSet(BlastScoreBlk* sbp)
- {
- Int4 score, maxscore;
- Int4 ** matrix;
- Int2 index1, index2;
- sbp->loscore = BLAST_SCORE_1MAX;
- sbp->hiscore = BLAST_SCORE_1MIN;
- matrix = sbp->matrix;
- for (index1=0; index1<sbp->alphabet_size; index1++)
- {
- maxscore=BLAST_SCORE_MIN;
- for (index2=0; index2<sbp->alphabet_size; index2++)
- {
- score = matrix[index1][index2];
- if (score <= BLAST_SCORE_MIN || score >= BLAST_SCORE_MAX)
- continue;
- if (score > maxscore)
- {
- maxscore = score;
- }
- if (sbp->loscore > score)
- sbp->loscore = score;
- if (sbp->hiscore < score)
- sbp->hiscore = score;
- }
- sbp->maxscore[index1] = maxscore;
- }
- /* If the lo/hi-scores are BLAST_SCORE_MIN/BLAST_SCORE_MAX, (i.e., for
- gaps), then use other scores. */
- if (sbp->loscore < BLAST_SCORE_1MIN)
- sbp->loscore = BLAST_SCORE_1MIN;
- if (sbp->hiscore > BLAST_SCORE_1MAX)
- sbp->hiscore = BLAST_SCORE_1MAX;
- return 0;
- }
- Int2
- BlastScoreBlkMatrixLoad(BlastScoreBlk* sbp)
- {
- Int2 status = 0;
- SNCBIPackedScoreMatrix* psm;
- Int4** matrix = sbp->matrix;
- int i, j; /* loop indices */
- ASSERT(sbp);
- if (strcasecmp(sbp->name, "BLOSUM62") == 0) {
- psm = (SNCBIPackedScoreMatrix*) &NCBISM_Blosum62;
- } else if (strcasecmp(sbp->name, "BLOSUM45") == 0) {
- psm = (SNCBIPackedScoreMatrix*) &NCBISM_Blosum45;
- } else if (strcasecmp(sbp->name, "BLOSUM80") == 0) {
- psm = (SNCBIPackedScoreMatrix*) &NCBISM_Blosum80;
- } else if (strcasecmp(sbp->name, "PAM30") == 0) {
- psm = (SNCBIPackedScoreMatrix*) &NCBISM_Pam30;
- } else if (strcasecmp(sbp->name, "PAM70") == 0) {
- psm = (SNCBIPackedScoreMatrix*) &NCBISM_Pam70;
- } else {
- return -1;
- }
- /* Initialize with BLAST_SCORE_MIN */
- for (i = 0; i < sbp->alphabet_size; i++) {
- for (j = 0; j < sbp->alphabet_size; j++) {
- matrix[i][j] = BLAST_SCORE_MIN;
- }
- }
- for (i = 0; i < sbp->alphabet_size; i++) {
- for (j = 0; j < sbp->alphabet_size; j++) {
- /* skip selenocysteine and gap */
- if (i == AMINOACID_TO_NCBISTDAA['U'] ||
- i == AMINOACID_TO_NCBISTDAA['-'] ||
- j == AMINOACID_TO_NCBISTDAA['U'] ||
- j == AMINOACID_TO_NCBISTDAA['-']) {
- continue;
- }
- matrix[i][j] = NCBISM_GetScore((const SNCBIPackedScoreMatrix*) psm,
- i, j);
- }
- }
- /* Sets dimensions of matrix. */
- sbp->mat_dim1 = sbp->mat_dim2 = sbp->alphabet_size;
- return status;
- }
- Int2
- BLAST_ScoreBlkMatFill(BlastScoreBlk* sbp, char* matrix_path)
- {
- Int2 status = 0;
-
- if (sbp->read_in_matrix) {
-
- if (matrix_path && *matrix_path != NULLB) {
- FILE *fp = NULL;
- char* full_matrix_path = NULL;
- int path_len = strlen(matrix_path);
- int buflen = path_len + strlen(sbp->name);
- full_matrix_path = (char*) malloc((buflen + 1) * sizeof(char));
- if (!full_matrix_path) {
- return -1;
- }
- strncpy(full_matrix_path, matrix_path, buflen);
- strncat(full_matrix_path, sbp->name, buflen - path_len);
- if ( (fp=fopen(full_matrix_path, "r")) == NULL) {
- return -1;
- }
- sfree(full_matrix_path);
- if ( (status=BlastScoreBlkMatRead(sbp, fp)) != 0) {
- fclose(fp);
- return status;
- }
- fclose(fp);
- } else {
- if ( (status = BlastScoreBlkMatrixLoad(sbp)) !=0) {
- return status;
- }
- }
- } else {
- /* Nucleotide BLAST only! */
- if ( (status=BlastScoreBlkMatCreate(sbp)) != 0)
- return status;
- }
-
- if ( (status=BlastScoreBlkMaxScoreSet(sbp)) != 0)
- return status;
- return status;
- }
- Blast_ResFreq*
- Blast_ResFreqDestruct(Blast_ResFreq* rfp)
- {
- if (rfp == NULL)
- return NULL;
- if (rfp->prob0 != NULL)
- sfree(rfp->prob0);
- sfree(rfp);
- return rfp;
- }
- /*
- Allocates the Blast_ResFreq* and then fills in the frequencies
- in the probabilities.
- */
- Blast_ResFreq*
- Blast_ResFreqNew(const BlastScoreBlk* sbp)
- {
- Blast_ResFreq* rfp;
- if (sbp == NULL)
- {
- return NULL;
- }
- rfp = (Blast_ResFreq*) calloc(1, sizeof(Blast_ResFreq));
- if (rfp == NULL)
- return NULL;
- rfp->alphabet_code = sbp->alphabet_code;
- rfp->prob0 = (double*) calloc(sbp->alphabet_size, sizeof(double));
- if (rfp->prob0 == NULL)
- {
- rfp = Blast_ResFreqDestruct(rfp);
- return rfp;
- }
- rfp->prob = rfp->prob0 - sbp->alphabet_start;
- return rfp;
- }
- typedef struct BLAST_LetterProb {
- char ch;
- double p;
- } BLAST_LetterProb;
- #if 0
- /* Unused for right now, but do not remove */
- /* M. O. Dayhoff amino acid background frequencies */
- static BLAST_LetterProb Dayhoff_prob[] = {
- { 'A', 87.13 },
- { 'C', 33.47 },
- { 'D', 46.87 },
- { 'E', 49.53 },
- { 'F', 39.77 },
- { 'G', 88.61 },
- { 'H', 33.62 },
- { 'I', 36.89 },
- { 'K', 80.48 },
- { 'L', 85.36 },
- { 'M', 14.75 },
- { 'N', 40.43 },
- { 'P', 50.68 },
- { 'Q', 38.26 },
- { 'R', 40.90 },
- { 'S', 69.58 },
- { 'T', 58.54 },
- { 'V', 64.72 },
- { 'W', 10.49 },
- { 'Y', 29.92 }
- };
- /* Stephen Altschul amino acid background frequencies */
- static BLAST_LetterProb Altschul_prob[] = {
- { 'A', 81.00 },
- { 'C', 15.00 },
- { 'D', 54.00 },
- { 'E', 61.00 },
- { 'F', 40.00 },
- { 'G', 68.00 },
- { 'H', 22.00 },
- { 'I', 57.00 },
- { 'K', 56.00 },
- { 'L', 93.00 },
- { 'M', 25.00 },
- { 'N', 45.00 },
- { 'P', 49.00 },
- { 'Q', 39.00 },
- { 'R', 57.00 },
- { 'S', 68.00 },
- { 'T', 58.00 },
- { 'V', 67.00 },
- { 'W', 13.00 },
- { 'Y', 32.00 }
- };
- #endif
- /* amino acid background frequencies from Robinson and Robinson */
- static BLAST_LetterProb Robinson_prob[] = {
- { 'A', 78.05 },
- { 'C', 19.25 },
- { 'D', 53.64 },
- { 'E', 62.95 },
- { 'F', 38.56 },
- { 'G', 73.77 },
- { 'H', 21.99 },
- { 'I', 51.42 },
- { 'K', 57.44 },
- { 'L', 90.19 },
- { 'M', 22.43 },
- { 'N', 44.87 },
- { 'P', 52.03 },
- { 'Q', 42.64 },
- { 'R', 51.29 },
- { 'S', 71.20 },
- { 'T', 58.41 },
- { 'V', 64.41 },
- { 'W', 13.30 },
- { 'Y', 32.16 }
- };
- #define STD_AMINO_ACID_FREQS Robinson_prob
- static BLAST_LetterProb nt_prob[] = {
- { 'A', 25.00 },
- { 'C', 25.00 },
- { 'G', 25.00 },
- { 'T', 25.00 }
- };
- /*
- Normalize the frequencies to "norm".
- */
- static Int2
- Blast_ResFreqNormalize(const BlastScoreBlk* sbp, Blast_ResFreq* rfp, double norm)
- {
- Int2 alphabet_stop, index;
- double sum = 0., p;
- if (norm == 0.)
- return 1;
- alphabet_stop = sbp->alphabet_start + sbp->alphabet_size;
- for (index=sbp->alphabet_start; index<alphabet_stop; index++)
- {
- p = rfp->prob[index];
- if (p < 0.)
- return 1;
- sum += p;
- }
- if (sum <= 0.)
- return 0;
- for (index=sbp->alphabet_start; index<alphabet_stop; index++)
- {
- rfp->prob[index] /= sum;
- rfp->prob[index] *= norm;
- }
- return 0;
- }
- Int2
- Blast_GetStdAlphabet(Uint1 alphabet_code, Uint1* residues, Uint4 residues_size)
- {
- Int2 index;
- if (residues_size < DIM(STD_AMINO_ACID_FREQS))
- return -2;
- for (index=0; index<(int)DIM(STD_AMINO_ACID_FREQS); index++)
- {
- if (alphabet_code == BLASTAA_SEQ_CODE)
- {
- residues[index] =
- AMINOACID_TO_NCBISTDAA[toupper(STD_AMINO_ACID_FREQS[index].ch)];
- }
- else
- {
- residues[index] = STD_AMINO_ACID_FREQS[index].ch;
- }
- }
- return index;
- }
- Int2
- Blast_ResFreqStdComp(const BlastScoreBlk* sbp, Blast_ResFreq* rfp)
- {
- Int2 retval;
- Uint4 index;
- Uint1* residues;
- if (sbp->protein_alphabet == TRUE)
- {
- residues = (Uint1*) calloc(DIM(STD_AMINO_ACID_FREQS), sizeof(Uint1));
- retval = Blast_GetStdAlphabet(sbp->alphabet_code, residues, DIM(STD_AMINO_ACID_FREQS));
- if (retval < 1)
- return retval;
- for (index=0; index<DIM(STD_AMINO_ACID_FREQS); index++)
- {
- rfp->prob[residues[index]] = STD_AMINO_ACID_FREQS[index].p;
- }
- sfree(residues);
- }
- else
- { /* beginning of blastna and ncbi2na are the same. */
- /* Only blastna used for nucleotides. */
- for (index=0; index<DIM(nt_prob); index++)
- {
- rfp->prob[index] = nt_prob[index].p;
- }
- }
- Blast_ResFreqNormalize(sbp, rfp, 1.0);
- return 0;
- }
- typedef struct Blast_ResComp {
- Uint1 alphabet_code;
- Int4* comp; /* composition of alphabet, array starts at beginning of alphabet. */
- Int4* comp0; /* Same array as above, starts at zero. */
- } Blast_ResComp;
- static Blast_ResComp*
- BlastResCompDestruct(Blast_ResComp* rcp)
- {
- if (rcp == NULL)
- return NULL;
- if (rcp->comp0 != NULL)
- sfree(rcp->comp0);
- sfree(rcp);
- return NULL;
- }
- /*
- Allocated the Blast_ResComp* for a given alphabet. Only the
- alphabets ncbistdaa and ncbi4na should be used by BLAST.
- */
- static Blast_ResComp*
- BlastResCompNew(BlastScoreBlk* sbp)
- {
- Blast_ResComp* rcp;
- rcp = (Blast_ResComp*) calloc(1, sizeof(Blast_ResComp));
- if (rcp == NULL)
- return NULL;
- rcp->alphabet_code = sbp->alphabet_code;
- /* comp0 has zero offset, comp starts at 0, only one
- array is allocated. */
- rcp->comp0 = (Int4*) calloc(BLAST_MATRIX_SIZE, sizeof(Int4));
- if (rcp->comp0 == NULL)
- {
- rcp = BlastResCompDestruct(rcp);
- return rcp;
- }
- rcp->comp = rcp->comp0 - sbp->alphabet_start;
- return rcp;
- }
- /*
- Store the composition of a (query) string.
- */
- static Int2
- BlastResCompStr(BlastScoreBlk* sbp, Blast_ResComp* rcp, char* str, Int4 length)
- {
- char* lp,* lpmax;
- Int2 index;
- Uint1 mask;
- if (sbp == NULL || rcp == NULL || str == NULL)
- return 1;
- if (rcp->alphabet_code != sbp->alphabet_code)
- return 1;
- /* For megablast, check only the first 4 bits of the sequence values */
- mask = (sbp->protein_alphabet ? 0xff : 0x0f);
- /* comp0 starts at zero and extends for "num", comp is the same array, but
- "start_at" offset. */
- for (index=0; index<(sbp->alphabet_size); index++)
- rcp->comp0[index] = 0;
- for (lp = str, lpmax = lp+length; lp < lpmax; lp++)
- {
- ++rcp->comp[(int)(*lp & mask)];
- }
- /* Don't count ambig. residues. */
- for (index=0; index<sbp->ambig_occupy; index++)
- {
- rcp->comp[sbp->ambiguous_res[index]] = 0;
- }
- return 0;
- }
- static Int2
- Blast_ResFreqClr(const BlastScoreBlk* sbp, Blast_ResFreq* rfp)
- {
- Int2 alphabet_max, index;
-
- alphabet_max = sbp->alphabet_start + sbp->alphabet_size;
- for (index=sbp->alphabet_start; index<alphabet_max; index++)
- rfp->prob[index] = 0.0;
-
- return 0;
- }
- /*
- Calculate the residue frequencies associated with the provided ResComp
- */
- static Int2
- Blast_ResFreqResComp(BlastScoreBlk* sbp, Blast_ResFreq* rfp,
- Blast_ResComp* rcp)
- {
- Int2 alphabet_max, index;
- double sum = 0.;
- if (rfp == NULL || rcp == NULL)
- return 1;
- if (rfp->alphabet_code != rcp->alphabet_code)
- return 1;
- alphabet_max = sbp->alphabet_start + sbp->alphabet_size;
- for (index=sbp->alphabet_start; index<alphabet_max; index++)
- sum += rcp->comp[index];
- if (sum == 0.) {
- Blast_ResFreqClr(sbp, rfp);
- return 0;
- }
- for (index=sbp->alphabet_start; index<alphabet_max; index++)
- rfp->prob[index] = rcp->comp[index] / sum;
- return 0;
- }
- static Int2
- Blast_ResFreqString(BlastScoreBlk* sbp, Blast_ResFreq* rfp, char* string, Int4 length)
- {
- Blast_ResComp* rcp;
-
- rcp = BlastResCompNew(sbp);
- BlastResCompStr(sbp, rcp, string, length);
- Blast_ResFreqResComp(sbp, rfp, rcp);
- rcp = BlastResCompDestruct(rcp);
- return 0;
- }
- static Int2
- BlastScoreChk(Int4 lo, Int4 hi)
- {
- if (lo >= 0 || hi <= 0 ||
- lo < BLAST_SCORE_1MIN || hi > BLAST_SCORE_1MAX)
- return 1;
- if (hi - lo > BLAST_SCORE_RANGE_MAX)
- return 1;
- return 0;
- }
- Blast_ScoreFreq*
- Blast_ScoreFreqNew(Int4 score_min, Int4 score_max)
- {
- Blast_ScoreFreq* sfp;
- Int4 range;
- if (BlastScoreChk(score_min, score_max) != 0)
- return NULL;
- sfp = (Blast_ScoreFreq*) calloc(1, sizeof(Blast_ScoreFreq));
- if (sfp == NULL)
- return NULL;
- range = score_max - score_min + 1;
- sfp->sprob = (double*) calloc(range, sizeof(double));
- if (sfp->sprob == NULL)
- {
- Blast_ScoreFreqDestruct(sfp);
- return NULL;
- }
- sfp->sprob0 = sfp->sprob;
- sfp->sprob -= score_min; /* center around 0 */
- sfp->score_min = score_min;
- sfp->score_max = score_max;
- sfp->obs_min = sfp->obs_max = 0;
- sfp->score_avg = 0.0;
- return sfp;
- }
- static Int2
- BlastScoreFreqCalc(BlastScoreBlk* sbp, Blast_ScoreFreq* sfp, Blast_ResFreq* rfp1, Blast_ResFreq* rfp2)
- {
- Int4 ** matrix;
- Int4 score, obs_min, obs_max;
- double score_sum, score_avg;
- Int2 alphabet_start, alphabet_end, index1, index2;
- if (sbp == NULL || sfp == NULL)
- return 1;
- if (sbp->loscore < sfp->score_min || sbp->hiscore > sfp->score_max)
- return 1;
- for (score = sfp->score_min; score <= sfp->score_max; score++)
- sfp->sprob[score] = 0.0;
- matrix = sbp->matrix;
- alphabet_start = sbp->alphabet_start;
- alphabet_end = alphabet_start + sbp->alphabet_size;
- for (index1=alphabet_start; index1<alphabet_end; index1++)
- {
- for (index2=alphabet_start; index2<alphabet_end; index2++)
- {
- score = matrix[index1][index2];
- if (score >= sbp->loscore)
- {
- sfp->sprob[score] += rfp1->prob[index1] * rfp2->prob[index2];
- }
- }
- }
- score_sum = 0.;
- obs_min = obs_max = BLAST_SCORE_MIN;
- for (score = sfp->score_min; score <= sfp->score_max; score++)
- {
- if (sfp->sprob[score] > 0.)
- {
- score_sum += sfp->sprob[score];
- obs_max = score;
- if (obs_min == BLAST_SCORE_MIN)
- obs_min = score;
- }
- }
- sfp->obs_min = obs_min;
- sfp->obs_max = obs_max;
- score_avg = 0.0;
- if (score_sum > 0.0001 || score_sum < -0.0001)
- {
- for (score = obs_min; score <= obs_max; score++)
- {
- sfp->sprob[score] /= score_sum;
- score_avg += score * sfp->sprob[score];
- }
- }
- sfp->score_avg = score_avg;
- return 0;
- }
- #define DIMOFP0 (iterlimit*range + 1)
- #define DIMOFP0_MAX (BLAST_KARLIN_K_ITER_MAX*BLAST_SCORE_RANGE_MAX+1)
- #define SMALL_LAMBDA_THRESHOLD 20 /*defines special case in K computation*/
- /*threshold is on exp(-Lambda)*/
- /*The following procedure computes K. The input includes Lambda, H,
- * and an array of probabilities for each score.
- * There are distinct closed form for three cases:
- * 1. high score is 1 low score is -1
- * 2. high score is 1 low score is not -1
- * 3. low score is -1, high score is not 1
- *
- * Otherwise, in most cases the value is computed as:
- * -exp(-2.0*outerSum) / ((H/lambda)*(exp(-lambda) - 1)
- * The last term (exp(-lambda) - 1) can be computed in two different
- * ways depending on whether lambda is small or not.
- * outerSum is a sum of the terms
- * innerSum/j, where j is denoted by iterCounter in the code.
- * The sum is truncated when the new term innersum/j i sufficiently small.
- * innerSum is a weighted sum of the probabilities of
- * of achieving a total score i in a gapless alignment,
- * which we denote by P(i,j).
- * of exactly j characters. innerSum(j) has two parts
- * Sum over i < 0 P(i,j)exp(-i * lambda) +
- * Sum over i >=0 P(i,j)
- * The terms P(i,j) are computed by dynamic programming.
- * An earlier version was flawed in that ignored the special case 1
- * and tried to replace the tail of the computation of outerSum
- * by a geometric series, but the base of the geometric series
- * was not accurately estimated in some cases.
- */
- static double
- BlastKarlinLHtoK(Blast_ScoreFreq* sfp, double lambda, double H)
- {
- /*The next array stores the probabilities of getting each possible
- score in an alignment of fixed length; the array is shifted
- during part of the computation, so that
- entry 0 is for score 0. */
- double *alignmentScoreProbabilities = NULL;
- Int4 low; /* Lowest score (must be negative) */
- Int4 high; /* Highest score (must be positive) */
- Int4 range; /* range of scores, computed as high - low*/
- double K; /* local copy of K to return*/
- int i; /*loop index*/
- int iterCounter; /*counter on iterations*/
- Int4 divisor; /*candidate divisor of all scores with
- non-zero probabilities*/
- /*highest and lowest possible alignment scores for current length*/
- Int4 lowAlignmentScore, highAlignmentScore;
- Int4 first, last; /*loop indices for dynamic program*/
- register double innerSum;
- double oldsum, oldsum2; /* values of innerSum on previous
- iterations*/
- double outerSum; /* holds sum over j of (innerSum
- for iteration j/j)*/
- double score_avg; /*average score*/
- /*first term to use in the closed form for the case where
- high == 1 or low == -1, but not both*/
- double firstTermClosedForm; /*usually store H/lambda*/
- int iterlimit; /*upper limit on iterations*/
- double sumlimit; /*lower limit on contributions
- to sum over scores*/
- /*array of score probabilities reindexed so that low is at index 0*/
- double *probArrayStartLow;
- /*pointers used in dynamic program*/
- double *ptrP, *ptr1, *ptr2, *ptr1e;
- double expMinusLambda; /*e^^(-Lambda) */
- if (lambda <= 0. || H <= 0.) {
- /* Theory dictates that H and lambda must be positive, so
- * return -1 to indicate an error */
- return -1.;
- }
- /*Karlin-Altschul theory works only if the expected score
- is negative*/
- if (sfp->score_avg >= 0.0) {
- return -1.;
- }
- low = sfp->obs_min;
- high = sfp->obs_max;
- range = high - low;
- probArrayStartLow = &sfp->sprob[low];
- /* Look for the greatest common divisor ("delta" in Appendix of PNAS 87 of
- Karlin&Altschul (1990) */
- for (i = 1, divisor = -low; i <= range && divisor > 1; ++i) {
- if (probArrayStartLow[i])
- divisor = BLAST_Gcd(divisor, i);
- }
- high /= divisor;
- low /= divisor;
- lambda *= divisor;
- range = high - low;
- firstTermClosedForm = H/lambda;
- expMinusLambda = exp((double) -lambda);
- if (low == -1 && high == 1) {
- K = (sfp->sprob[low] - sfp->sprob[high]) *
- (sfp->sprob[low] - sfp->sprob[high]) / sfp->sprob[low];
- return(K);
- }
- if (low == -1 || high == 1) {
- if (high == 1)
- ;
- else {
- score_avg = sfp->score_avg / divisor;
- firstTermClosedForm
- = (score_avg * score_avg) / firstTermClosedForm;
- }
- return firstTermClosedForm * (1.0 - expMinusLambda);
- }
- sumlimit = BLAST_KARLIN_K_SUMLIMIT_DEFAULT;
- iterlimit = BLAST_KARLIN_K_ITER_MAX;
- if (DIMOFP0 > DIMOFP0_MAX) {
- return -1.;
- }
- alignmentScoreProbabilities =
- (double *)calloc(DIMOFP0, sizeof(*alignmentScoreProbabilities));
- if (alignmentScoreProbabilities == NULL)
- return -1.;
- outerSum = 0.;
- lowAlignmentScore = highAlignmentScore = 0;
- alignmentScoreProbabilities[0] = innerSum = oldsum = oldsum2 = 1.;
- for (iterCounter = 0;
- ((iterCounter < iterlimit) && (innerSum > sumlimit));
- outerSum += innerSum /= ++iterCounter) {
- first = last = range;
- lowAlignmentScore += low;
- highAlignmentScore += high;
- /*dynamic program to compute P(i,j)*/
- for (ptrP = alignmentScoreProbabilities +
- (highAlignmentScore-lowAlignmentScore);
- ptrP >= alignmentScoreProbabilities;
- *ptrP-- =innerSum) {
- ptr1 = ptrP - first;
- ptr1e = ptrP - last;
- ptr2 = probArrayStartLow + first;
- for (innerSum = 0.; ptr1 >= ptr1e; )
- innerSum += *ptr1-- * *ptr2++;
- if (first)
- --first;
- if (ptrP - alignmentScoreProbabilities <= range)
- --last;
- }
- /* Horner's rule */
- innerSum = *++ptrP;
- for( i = lowAlignmentScore + 1; i < 0; i++ ) {
- innerSum = *++ptrP + innerSum * expMinusLambda;
- }
- innerSum *= expMinusLambda;
- for (; i <= highAlignmentScore; ++i)
- innerSum += *++ptrP;
- oldsum2 = oldsum;
- oldsum = innerSum;
- }
- #ifdef ADD_GEOMETRIC_TERMS_TO_K
- /*old code assumed that the later terms in sum were
- asymptotically comparable to those of a geometric
- progression, and tried to speed up convergence by
- guessing the estimated ratio between sucessive terms
- and using the explicit terms of a geometric progression
- to speed up convergence. However, the assumption does not
- always hold, and convergenece of the above code is fast
- enough in practice*/
- /* Terms of geometric progression added for correction */
- {
- double ratio; /* fraction used to generate the
- geometric progression */
- ratio = oldsum / oldsum2;
- if (ratio >= (1.0 - sumlimit*0.001)) {
- K = -1.;
- if (alignmentScoreProbabilities != NULL)
- sfree(alignmentScoreProbabilities);
- return K;
- }
- sumlimit *= 0.01;
- while (innerSum > sumlimit) {
- oldsum *= ratio;
- outerSum += innerSum = oldsum / ++iterCounter;
- }
- }
- #endif
- if (expMinusLambda < SMALL_LAMBDA_THRESHOLD ) {
- K = -exp((double)-2.0*outerSum) /
- (firstTermClosedForm*(expMinusLambda - 1.0));
- } else {
- K = -exp((double)-2.0*outerSum) /
- (firstTermClosedForm*BLAST_Expm1(-(double)lambda));
- }
- if (alignmentScoreProbabilities != NULL)
- sfree(alignmentScoreProbabilities);
- return K;
- }
- /**
- * Find positive solution to sum_{i=low}^{high} exp(i lambda) = 1.
- *
- * @param probs probabilities of a score occurring
- * @param d the gcd of the possible scores. This equals 1 if the scores
- * are not a lattice
- * @param low the lowest possible score
- * @param high the highest possible score
- * @param lambda0 an initial value for lambda
- * @param tolx the tolerance to which lambda must be computed
- * @param itmax the maximum number of times the function may be
- * evaluated
- * @param maxNewton the maximum permissible number of Newton
- * iteration. After that the computation will proceed by bisection.
- * @param itn a pointer to an integer that will receive the actually
- * number of iterations performed.
- *
- * Let phi(lambda) = sum_{i=low}^{high} exp(i lambda) - 1. Then phi(lambda)
- * may be written
- *
- * phi(lamdba) = exp(u lambda) p( exp(-lambda) )
- *
- * where p(x) is a polynomial that has exactly two zeros, one at x = 1
- * and one at y = exp(-lamdba). It is simpler, more numerically
- * efficient and stable to apply Newton's method to p(x) than to
- * phi(lambda).
- *
- * We define a safeguarded Newton iteration as follows. Let the
- * initial interval of uncertainty be [0,1]. If p'(x) >= 0, we bisect
- * the interval. Otherwise we try a Newton step. If the Newton iterate
- * lies in the current interval of uncertainty and it reduces the
- * value of | p(x) | by at least 10%, we accept the new
- * point. Otherwise, we bisect the current interval of uncertainty.
- * It is clear that this method converges to a zero of p(x). Since
- * p'(x) > 0 in an interval containing x = 1, the method cannot
- * converge to x = 1 and therefore converges to the only other zero,
- * y.
- */
- static double
- NlmKarlinLambdaNR( double* probs, Int4 d, Int4 low, Int4 high, double lambda0, double tolx,
- Int4 itmax, Int4 maxNewton, Int4 * itn )
- {
- Int4 k;
- double x0, x, a = 0, b = 1;
- double f = 4; /* Larger than any possible value of the poly in [0,1] */
- Int4 isNewton = 0; /* we haven't yet taken a Newton step. */
- assert( d > 0 );
- x0 = exp( -lambda0 );
- x = ( 0 < x0 && x0 < 1 ) ? x0 : .5;
-
- for( k = 0; k < itmax; k++ ) { /* all iteration indices k */
- Int4 i;
- double g, fold = f;
- Int4 wasNewton = isNewton; /* If true, then the previous step was a */
- /* Newton step */
- isNewton = 0; /* Assume that this step is not */
-
- /* Horner's rule for evaluating a polynomial and its derivative */
- g = 0;
- f = probs[low];
- for( i = low + d; i < 0; i += d ) {
- g = x * g + f;
- f = f * x + probs[i];
- }
- g = x * g + f;
- f = f * x + probs[0] - 1;
- for( i = d; i <= high; i += d ) {
- g = x * g + f;
- f = f * x + probs[i];
- }
- /* End Horner's rule */
- if( f > 0 ) {
- a = x; /* move the left endpoint */
- } else if( f < 0 ) {
- b = x; /* move the right endpoint */
- } else { /* f == 0 */
- break; /* x is an exact solution */
- }
- if( b - a < 2 * a * ( 1 - b ) * tolx ) {
- /* The midpoint of the interval converged */
- x = (a + b) / 2; break;
- }
- if( k >= maxNewton ||
- /* If convergence of Newton's method appears to be failing; or */
- ( wasNewton && fabs( f ) > .9 * fabs(fold) ) ||
- /* if the previous iteration was a Newton step but didn't decrease
- * f sufficiently; or */
- g >= 0
- /* if a Newton step will move us away from the desired solution */
- ) { /* then */
- /* bisect */
- x = (a + b)/2;
- } else {
- /* try a Newton step */
- double p = - f/g;
- double y = x + p;
- if( y <= a || y >= b ) { /* The proposed iterate is not in (a,b) */
- x = (a + b)/2;
- } else { /* The proposed iterate is in (a,b). Accept it. */
- isNewton = 1;
- x = y;
- if( fabs( p ) < tolx * x * (1-x) ) break; /* Converged */
- } /* else the proposed iterate is in (a,b) */
- } /* else try a Newton step. */
- } /* end for all iteration indices k */
- *itn = k;
- return -log(x)/d;
- }
- double
- Blast_KarlinLambdaNR(Blast_ScoreFreq* sfp, double initialLambdaGuess)
- {
- Int4 low; /* Lowest score (must be negative) */
- Int4 high; /* Highest score (must be positive) */
- Int4 itn;
- Int4 i, d;
- double* sprob;
- double returnValue;
- low = sfp->obs_min;
- high = sfp->obs_max;
- if (sfp->score_avg >= 0.) { /* Expected score must be negative */
- return -1.0;
- }
- if (BlastScoreChk(low, high) != 0) return -1.;
-
- sprob = sfp->sprob;
- /* Find greatest common divisor of all scores */
- for (i = 1, d = -low; i <= high-low && d > 1; ++i) {
- if (sprob[i+low] != 0) {
- d = BLAST_Gcd(d, i);
- }
- }
- returnValue =
- NlmKarlinLambdaNR( sprob, d, low, high,
- initialLambdaGuess,
- BLAST_KARLIN_LAMBDA_ACCURACY_DEFAULT,
- 20, 20 + BLAST_KARLIN_LAMBDA_ITER_DEFAULT, &itn );
- return returnValue;
- }
- /*
- BlastKarlinLtoH
- Calculate H, the relative entropy of the p's and q's
- */
- static double
- BlastKarlinLtoH(Blast_ScoreFreq* sfp, double lambda)
- {
- Int4 score;
- double H, etonlam, sum, scale;
- double *probs = sfp->sprob;
- Int4 low = sfp->obs_min, high = sfp->obs_max;
- if (lambda < 0.) {
- return -1.;
- }
- if (BlastScoreChk(low, high) != 0) return -1.;
- etonlam = exp( - lambda );
- sum = low * probs[low];
- for( score = low + 1; score <= high; score++ ) {
- sum = score * probs[score] + etonlam * sum;
- }
- scale = BLAST_Powi( etonlam, high );
- if( scale > 0 ) {
- H = lambda * sum/scale;
- } else { /* Underflow of exp( -lambda * high ) */
- H = lambda * exp( lambda * high + log(sum) );
- }
- return H;
- }
- /**************** Statistical Significance Parameter Subroutine ****************
- Version 1.0 February 2, 1990
- Version 1.2 July 6, 1990
- Program by: Stephen Altschul
- Address: National Center for Biotechnology Information
- National Library of Medicine
- National Institutes of Health
- Bethesda, MD 20894
- Internet: altschul@ncbi.nlm.nih.gov
- See: Karlin, S. & Altschul, S.F. "Methods for Assessing the Statistical
- Significance of Molecular Sequence Features by Using General Scoring
- Schemes," Proc. Natl. Acad. Sci. USA 87 (1990), 2264-2268.
- Computes the parameters lambda and K for use in calculating the
- statistical significance of high-scoring segments or subalignments.
- The scoring scheme must be integer valued. A positive score must be
- possible, but the expected (mean) score must be negative.
- A program that calls this routine must provide the value of the lowest
- possible score, the value of the greatest possible score, and a pointer
- to an array of probabilities for the occurrence of all scores between
- these two extreme scores. For example, if score -2 occurs with
- probability 0.7, score 0 occurs with probability 0.1, and score 3
- occurs with probability 0.2, then the subroutine must be called with
- low = -2, high = 3, and pr pointing to the array of values
- { 0.7, 0.0, 0.1, 0.0, 0.0, 0.2 }. The calling program must also provide
- pointers to lambda and K; the subroutine will then calculate the values
- of these two parameters. In this example, lambda=0.330 and K=0.154.
- The parameters lambda and K can be used as follows. Suppose we are
- given a length N random sequence of independent letters. Associated
- with each letter is a score, and the probabilities of the letters
- determine the probability for each score. Let S be the aggregate score
- of the highest scoring contiguous segment of this sequence. Then if N
- is sufficiently large (greater than 100), the following bound on the
- probability that S is greater than or equal to x applies:
- P( S >= x ) <= 1 - exp [ - KN exp ( - lambda * x ) ].
- In other words, the p-value for this segment can be written as
- 1-exp[-KN*exp(-lambda*S)].
- This formula can be applied to pairwise sequence comparison by assigning
- scores to pairs of letters (e.g. amino acids), and by replacing N in the
- formula with N*M, where N and M are the lengths of the two sequences
- being compared.
- In addition, letting y = KN*exp(-lambda*S), the p-value for finding m
- distinct segments all with score >= S is given by:
- 2 m-1 -y
- 1 - [ 1 + y + y /2! + ... + y /(m-1)! ] e
- Notice that for m=1 this formula reduces to 1-exp(-y), which is the same
- as the previous formula.
- *******************************************************************************/
- Int2
- Blast_KarlinBlkCalc(Blast_KarlinBlk* kbp, Blast_ScoreFreq* sfp)
- {
-
- if (kbp == NULL || sfp == NULL)
- return 1;
- /* Calculate the parameter Lambda */
- kbp->Lambda = Blast_KarlinLambdaNR(sfp, BLAST_KARLIN_LAMBDA0_DEFAULT);
- if (kbp->Lambda < 0.)
- goto ErrExit;
- /* Calculate H */
- kbp->H = BlastKarlinLtoH(sfp, kbp->Lambda);
- if (kbp->H < 0.)
- goto ErrExit;
- /* Calculate K and log(K) */
- kbp->K = BlastKarlinLHtoK(sfp, kbp->Lambda, kbp->H);
- if (kbp->K < 0.)
- goto ErrExit;
- kbp->logK = log(kbp->K);
- /* Normal return */
- return 0;
- ErrExit:
- kbp->Lambda = kbp->H = kbp->K = -1.;
- #ifdef BLASTKAR_HUGE_VAL
- kbp->logK = BLASTKAR_HUGE_VAL;
- #else
- kbp->logK = 1.e30;
- #endif
- return 1;
- }
- /*
- Calculate the Karlin parameters. This function should be called once
- for each context, or frame translated.
-
- The rfp and stdrfp are calculated for each context, this should be
- fixed.
- */
- Int2
- BLAST_ScoreBlkFill(BlastScoreBlk* sbp, char* query, Int4 query_length, Int4 context_number)
- {
- Blast_ResFreq* rfp,* stdrfp;
- Int2 retval=0;
- rfp = Blast_ResFreqNew(sbp);
- stdrfp = Blast_ResFreqNew(sbp);
- Blast_ResFreqStdComp(sbp, stdrfp);
- Blast_ResFreqString(sbp, rfp, query, query_length);
- sbp->sfp[context_number] = Blast_ScoreFreqNew(sbp->loscore, sbp->hiscore);
- BlastScoreFreqCalc(sbp, sbp->sfp[context_number], rfp, stdrfp);
- sbp->kbp_std[context_number] = Blast_KarlinBlkCreate();
- retval = Blast_KarlinBlkCalc(sbp->kbp_std[context_number], sbp->sfp[context_number]);
- if (retval)
- {
- rfp = Blast_ResFreqDestruct(rfp);
- stdrfp = Blast_ResFreqDestruct(stdrfp);
- return retval;
- }
- sbp->kbp_psi[context_number] = Blast_KarlinBlkCreate();
- retval = Blast_KarlinBlkCalc(sbp->kbp_psi[context_number], sbp->sfp[context_number]);
- rfp = Blast_ResFreqDestruct(rfp);
- stdrfp = Blast_ResFreqDestruct(stdrfp);
- return retval;
- }
- /*
- Calculates the standard Karlin parameters. This is used
- if the query is translated and the calculated (real) Karlin
- parameters are bad, as they're calculated for non-coding regions.
- */
- Blast_KarlinBlk*
- Blast_KarlinBlkIdealCalc(BlastScoreBlk* sbp)
- {
- Blast_KarlinBlk* kbp_ideal;
- Blast_ResFreq* stdrfp;
- Blast_ScoreFreq* sfp;
- stdrfp = Blast_ResFreqNew(sbp);
- Blast_ResFreqStdComp(sbp, stdrfp);
- sfp = Blast_ScoreFreqNew(sbp->loscore, sbp->hiscore);
- BlastScoreFreqCalc(sbp, sfp, stdrfp, stdrfp);
- kbp_ideal = Blast_KarlinBlkCreate();
- Blast_KarlinBlkCalc(kbp_ideal, sfp);
- stdrfp = Blast_ResFreqDestruct(stdrfp);
- sfp = Blast_ScoreFreqDestruct(sfp);
- return kbp_ideal;
- }
- Int2
- Blast_KarlinBlkStandardCalc(BlastScoreBlk* sbp, Int4 context_start, Int4 context_end)
- {
- Blast_KarlinBlk* kbp_ideal,* kbp;
- Int4 index;
- kbp_ideal = Blast_KarlinBlkIdealCalc(sbp);
- /* Replace the calculated values with ideal ones for blastx, tblastx. */
- for (index=context_start; index<=context_end; index++)
- {
- kbp = sbp->kbp[index];
- if (!kbp)
- continue;
- if (kbp->Lambda >= kbp_ideal->Lambda)
- {
- kbp->Lambda = kbp_ideal->Lambda;
- kbp->K = kbp_ideal->K;
- kbp->logK = kbp_ideal->logK;
- kbp->H = kbp_ideal->H;
- }
- }
- kbp_ideal = Blast_KarlinBlkDestruct(kbp_ideal);
- return 0;
- }
- /*
- Creates the Karlin Block.
- */
- Blast_KarlinBlk*
- Blast_KarlinBlkCreate(void)
- {
- Blast_KarlinBlk* kbp;
- kbp = (Blast_KarlinBlk*) calloc(1, sizeof(Blast_KarlinBlk));
- return kbp;
- }
- static SBLASTMatrixStructure*
- BlastMatrixAllocate(Int2 alphabet_size)
- {
- SBLASTMatrixStructure* matrix_struct;
- Int2 index;
- if (alphabet_size <= 0 || alphabet_size >= BLAST_MATRIX_SIZE)
- return NULL;
- matrix_struct = (SBLASTMatrixStructure*) calloc(1, sizeof(SBLASTMatrixStructure));
- if (matrix_struct == NULL)
- return NULL;
- for (index=0; index<BLAST_MATRIX_SIZE-1; index++)
- {
- matrix_struct->matrix[index] = matrix_struct->long_matrix + index*BLAST_MATRIX_SIZE;
- }
- return matrix_struct;
- }
- /*
- Deallocates MatrixInfo*
- */
- static MatrixInfo*
- MatrixInfoDestruct(MatrixInfo* matrix_info)
- {
- if (matrix_info == NULL)
- return NULL;
- sfree(matrix_info->name);
- sfree(matrix_info);
- return NULL;
- }
- /*
- Makes New MatrixInfo*
- */
- static MatrixInfo*
- MatrixInfoNew(char* name, array_of_8 *values, Int4* prefs, Int4 max_number)
- {
- MatrixInfo* matrix_info;
- matrix_info = (MatrixInfo*) calloc(1, sizeof(MatrixInfo));
- matrix_info->name = strdup(name);
- matrix_info->values = values;
- matrix_info->prefs = prefs;
- matrix_info->max_number_values = max_number;
- return matrix_info;
- }
- static ListNode*
- BlastMatrixValuesDestruct(ListNode* vnp)
- {
- ListNode* head;
- head = vnp;
- while (vnp)
- {
- MatrixInfoDestruct((MatrixInfo*) vnp->ptr);
- vnp = vnp->next;
- }
- head = ListNodeFree(head);
- return head;
- }
- /*
- ListNode* BlastLoadMatrixValues (void)
-
- Loads all the matrix values, returns a ListNode* chain that contains
- MatrixInfo*'s.
- */
- static ListNode*
- BlastLoadMatrixValues (void)
- {
- MatrixInfo* matrix_info;
- ListNode* retval=NULL;
- matrix_info = MatrixInfoNew("BLOSUM80", blosum80_values, blosum80_prefs, BLOSUM80_VALUES_MAX);
- ListNodeAddPointer(&retval, 0, matrix_info);
- matrix_info = MatrixInfoNew("BLOSUM62", blosum62_values, blosum62_prefs, BLOSUM62_VALUES_MAX);
- ListNodeAddPointer(&retval, 0, matrix_info);
- matrix_info = MatrixInfoNew("BLOSUM50", blosum50_values, blosum50_prefs, BLOSUM50_VALUES_MAX);
- ListNodeAddPointer(&retval, 0, matrix_info);
- matrix_info = MatrixInfoNew("BLOSUM45", blosum45_values, blosum45_prefs, BLOSUM45_VALUES_MAX);
- ListNodeAddPointer(&retval, 0, matrix_info);
- matrix_info = MatrixInfoNew("PAM250", pam250_values, pam250_prefs, PAM250_VALUES_MAX);
- ListNodeAddPointer(&retval, 0, matrix_info);
- matrix_info = MatrixInfoNew("BLOSUM62_20", blosum62_20_values, blosum62_20_prefs, BLOSUM62_20_VALUES_MAX);
- ListNodeAddPointer(&retval, 0, matrix_info);
- matrix_info = MatrixInfoNew("BLOSUM90", blosum90_values, blosum90_prefs, BLOSUM90_VALUES_MAX);
- ListNodeAddPointer(&retval, 0, matrix_info);
- matrix_info = MatrixInfoNew("PAM30", pam30_values, pam30_prefs, PAM30_VALUES_MAX);
- ListNodeAddPointer(&retval, 0, matrix_info);
- matrix_info = MatrixInfoNew("PAM70", pam70_values, pam70_prefs, PAM70_VALUES_MAX);
- ListNodeAddPointer(&retval, 0, matrix_info);
- return retval;
- }
- /*
- Int2
- BlastKarlinGetMatrixValuesEx2(char* matrix, Int4* open, Int4* extension, Int4* decline_align, double* lambda, double* K, double* H)
-
- Obtains arrays of the allowed opening and extension penalties for gapped BLAST for
- the given matrix. Also obtains arrays of Lambda, K, and H. Any of these fields that
- are not required should be set to NULL. The Int2 return value is the length of the
- arrays.
- */
- static Int2
- BlastKarlinGetMatrixValuesEx2(char* matrix, Int4** open, Int4** extension, Int4** decline_align, double** lambda, double** K, double** H, double** alpha, double** beta, Int4** pref_flags)
- {
- array_of_8 *values = NULL;
- Boolean found_matrix=FALSE;
- Int4 index, max_number_values=0;
- Int4* open_array=NULL,* extension_array=NULL,* decline_align_array=NULL,* pref_flags_array=NULL,* prefs=NULL;
- double* lambda_array=NULL,* K_array=NULL,* H_array=NULL,* alpha_array=NULL,* beta_array=NULL;
- MatrixInfo* matrix_info;
- ListNode* vnp,* head;
- if (matrix == NULL)
- return 0;
- vnp = head = BlastLoadMatrixValues();
- while (vnp)
- {
- matrix_info = vnp->ptr;
- if (strcasecmp(matrix_info->name, matrix) == 0)
- {
- values = matrix_info->values;
- max_number_values = matrix_info->max_number_values;
- prefs = matrix_info->prefs;
- found_matrix = TRUE;
- break;
- }
- vnp = vnp->next;
- }
- if (found_matrix)
- {
- if (open)
- *open = open_array = (Int4 *) calloc(max_number_values, sizeof(Int4));
- if (extension)
- *extension = extension_array =
- (Int4 *) calloc(max_number_values, sizeof(Int4));
- if (decline_align)
- *decline_align = decline_align_array =
- (Int4 *) calloc(max_number_values, sizeof(Int4));
- if (lambda)
- *lambda = lambda_array =
- (double*) calloc(max_number_values, sizeof(double));
- if (K)
- *K = K_array = (double*) calloc(max_number_values, sizeof(double));
- if (H)
- *H = H_array = (double*) calloc(max_number_values, sizeof(double));
- if (alpha)
- *alpha = alpha_array = (double*) calloc(max_number_values, sizeof(double));
- if (beta)
- *beta = beta_array = (double*) calloc(max_number_values, sizeof(double));
- if (pref_flags)
- *pref_flags = pref_flags_array =
- (Int4 *) calloc(max_number_values, sizeof(Int4));
- for (index=0; index<max_number_values; index++)
- {
- if (open)
- open_array[index] = (Int4) values[index][0];
- if (extension)
- extension_array[index] = (Int4) values[index][1];
- if (decline_align)
- decline_align_array[index] = (Int4) values[index][2];
- if (lambda)
- lambda_array[index] = values[index][3];
- if (K)
- K_array[index] = values[index][4];
- if (H)
- H_array[index] = values[index][5];
- if (alpha)
- alpha_array[index] = values[index][6];
- if (beta)
- beta_array[index] = values[index][7];
- if (pref_flags)
- pref_flags_array[index] = prefs[index];
- }
- }
- BlastMatrixValuesDestruct(head);
- return max_number_values;
- }
- /*Extract the alpha and beta settings for this matrixName, and these
- gap open and gap extension costs*/
- void BLAST_GetAlphaBeta(char* matrixName, double *alpha,
- double *beta, Boolean gapped, Int4 gap_open, Int4 gap_extend)
- {
- Int4* gapOpen_arr,* gapExtend_arr,* pref_flags;
- double* alpha_arr,* beta_arr;
- Int2 num_values;
- Int4 i; /*loop index*/
- num_values = BlastKarlinGetMatrixValuesEx2(matrixName, &gapOpen_arr,
- &gapExtend_arr, NULL, NULL, NULL, NULL, &alpha_arr, &beta_arr,
- &pref_flags);
- if (gapped) {
- if ((0 == gap_open) && (0 == gap_extend)) {
- for(i = 1; i < num_values; i++) {
- if(pref_flags[i]==BLAST_MATRIX_BEST) {
- (*alpha) = alpha_arr[i];
- (*beta) = beta_arr[i];
- break;
- }
- }
- }
- else {
- for(i = 1; i < num_values; i++) {
- if ((gapOpen_arr[i] == gap_open) &&
- (gapExtend_arr[i] == gap_extend)) {
- (*alpha) = alpha_arr[i];
- (*beta) = beta_arr[i];
- break;
- }
- }
- }
- }
- else if (num_values > 0) {
- (*alpha) = alpha_arr[0];
- (*beta) = beta_arr[0];
- }
- sfree(gapOpen_arr);
- sfree(gapExtend_arr);
- sfree(pref_flags);
- sfree(alpha_arr);
- sfree(beta_arr);
- }
- static Int2
- BlastKarlinReportAllowedValues(const char *matrix_name,
- Blast_Message** error_return)
- {
- array_of_8 *values = NULL;
- Boolean found_matrix=FALSE;
- char buffer[256];
- Int4 index, max_number_values=0;
- MatrixInfo* matrix_info;
- ListNode* vnp,* head;
- vnp = head = BlastLoadMatrixValues();
- while (vnp)
- {
- matrix_info = vnp->ptr;
- if (strcasecmp(matrix_info->name, matrix_name) == 0)
- {
- values = matrix_info->values;
- max_number_values = matrix_info->max_number_values;
- found_matrix = TRUE;
- break;
- }
- vnp = vnp->next;
- }
- if (found_matrix)
- {
- for (index=0; index<max_number_values; index++)
- {
- if (BLAST_Nint(values[index][2]) == INT2_MAX)
- sprintf(buffer, "Gap existence and extension values of %ld and %ld are supported", (long) BLAST_Nint(values[index][0]), (long) BLAST_Nint(values[index][1]));
- else
- sprintf(buffer, "Gap existence, extension and decline-to-align values of %ld, %ld and %ld are supported", (long) BLAST_Nint(values[index][0]), (long) BLAST_Nint(values[index][1]), (long) BLAST_Nint(values[index][2]));
- Blast_MessageWrite(error_return, BLAST_SEV_ERROR, 0, 0, buffer);
- }
- }
- BlastMatrixValuesDestruct(head);
- return 0;
- }
- /*
- Supplies lambda, H, and K values, as calcualted by Stephen Altschul
- in Methods in Enzy. (vol 266, page 474).
- if kbp is NULL, then a validation is perfomed.
- */
- Int2
- Blast_KarlinBlkGappedCalc(Blast_KarlinBlk* kbp, Int4 gap_open, Int4 gap_extend, Int4 decline_align, char* matrix_name, Blast_Message** error_return)
- {
- char buffer[256];
- Int2 status = Blast_KarlinkGapBlkFill(kbp, gap_open, gap_extend, decline_align, matrix_name);
- if (status && error_return)
- {
- if (status == 1)
- {
- MatrixInfo* matrix_info;
- ListNode* vnp,* head;
- vnp = head = BlastLoadMatrixValues();
- sprintf(buffer, "%s is not a supported matrix", matrix_name);
- Blast_MessageWrite(error_return, BLAST_SEV_ERROR, 0, 0, buffer);
- while (vnp)
- {
- matrix_info = vnp->ptr;
- sprintf(buffer, "%s is a supported matrix", matrix_info->name);
- Blast_MessageWrite(error_return, BLAST_SEV_ERROR, 0, 0, buffer);
- vnp = vnp->next;
- }
- BlastMatrixValuesDestruct(head);
- }
- else if (status == 2)
- {
- if (decline_align == INT2_MAX)
- sprintf(buffer, "Gap existence and extension values of %ld and %ld not supported for %s", (long) gap_open, (long) gap_extend, matrix_name);
- else
- sprintf(buffer, "Gap existence, extension and decline-to-align values of %ld, %ld and %ld not supported for %s", (long) gap_open, (long) gap_extend, (long) decline_align, matrix_name);
- Blast_MessageWrite(error_return, BLAST_SEV_ERROR, 0, 0, buffer);
- BlastKarlinReportAllowedValues(matrix_name, error_return);
- }
- }
- return status;
- }
- /*
- Attempts to fill KarlinBlk for given gap opening, extensions etc.
- Will return non-zero status if that fails.
- return values: -1 if matrix_name is NULL;
- 1 if matrix not found
- 2 if matrix found, but open, extend etc. values not supported.
- */
- Int2
- Blast_KarlinkGapBlkFill(Blast_KarlinBlk* kbp, Int4 gap_open, Int4 gap_extend, Int4 decline_align, char* matrix_name)
- {
- Boolean found_matrix=FALSE, found_values=FALSE;
- array_of_8 *values;
- Int2 status=0;
- Int4 index, max_number_values=0;
- MatrixInfo* matrix_info;
- ListNode* vnp,* head;
-
- if (matrix_name == NULL)
- return -1;
- values = NULL;
- vnp = head = BlastLoadMatrixValues();
- while (vnp)
- {
- matrix_info = vnp->ptr;
- if (strcasecmp(matrix_info->name, matrix_name) == 0)
- {
- values = matrix_info->values;
- max_number_values = matrix_info->max_number_values;
- found_matrix = TRUE;
- break;
- }
- vnp = vnp->next;
- }
- if (found_matrix)
- {
- for (index=0; index<max_number_values; index++)
- {
- if (BLAST_Nint(values[index][0]) == gap_open &&
- BLAST_Nint(values[index][1]) == gap_extend &&
- (BLAST_Nint(values[index][2]) == INT2_MAX || BLAST_Nint(values[index][2]) == decline_align))
- {
- if (kbp)
- {
- kbp->Lambda = values[index][3];
- kbp->K = values[index][4];
- kbp->logK = log(kbp->K);
- kbp->H = values[index][5];
- }
- found_values = TRUE;
- break;
- }
- }
- if (found_values == TRUE)
- {
- status = 0;
- }
- else
- {
- status = 2;
- }
- }
- else
- {
- status = 1;
- }
- BlastMatrixValuesDestruct(head);
- return status;
- }
- char*
- BLAST_PrintMatrixMessage(const char *matrix_name)
- {
- char* buffer= (char *) calloc(1024, sizeof(char));
- char* ptr;
- MatrixInfo* matrix_info;
- ListNode* vnp,* head;
- ptr = buffer;
- sprintf(ptr, "%s is not a supported matrix, supported matrices are:n", matrix_name);
- ptr += strlen(ptr);
- vnp = head = BlastLoadMatrixValues();
- while (vnp)
- {
- matrix_info = vnp->ptr;
- sprintf(ptr, "%s n", matrix_info->name);
- ptr += strlen(ptr);
- vnp = vnp->next;
- }
- BlastMatrixValuesDestruct(head);
- return buffer;
- }
- char*
- BLAST_PrintAllowedValues(const char *matrix_name, Int4 gap_open, Int4 gap_extend, Int4 decline_align)
- {
- array_of_8 *values = NULL;
- Boolean found_matrix=FALSE;
- char* buffer,* ptr;
- Int4 index, max_number_values=0;
- MatrixInfo* matrix_info;
- ListNode* vnp,* head;
- ptr = buffer = (char *) calloc(2048, sizeof(char));
- if (decline_align == INT2_MAX)
- sprintf(ptr, "Gap existence and extension values of %ld and %ld not supported for %snsupported values are:n",
- (long) gap_open, (long) gap_extend, matrix_name);
- else
- sprintf(ptr, "Gap existence, extension and decline-to-align values of %ld, %ld and %ld not supported for %sn supported values are:n",
- (long) gap_open, (long) gap_extend, (long) decline_align, matrix_name);
- ptr += strlen(ptr);
- vnp = head = BlastLoadMatrixValues();
- while (vnp)
- {
- matrix_info = vnp->ptr;
- if (strcasecmp(matrix_info->name, matrix_name) == 0)
- {
- values = matrix_info->values;
- max_number_values = matrix_info->max_number_values;
- found_matrix = TRUE;
- break;
- }
- vnp = vnp->next;
- }
- if (found_matrix)
- {
- for (index=0; index<max_number_values; index++)
- {
- if (BLAST_Nint(values[index][2]) == INT2_MAX)
- sprintf(ptr, "%ld, %ldn", (long) BLAST_Nint(values[index][0]), (long) BLAST_Nint(values[index][1]));
- else
- sprintf(ptr, "%ld, %ld, %ldn", (long) BLAST_Nint(values[index][0]), (long) BLAST_Nint(values[index][1]), (long) BLAST_Nint(values[index][2]));
- ptr += strlen(ptr);
- }
- }
- BlastMatrixValuesDestruct(head);
- return buffer;
- }
-
- /* Smallest float that might not cause a floating point exception in
- S = (Int4) (ceil( log((double)(K * searchsp / E)) / Lambda ));
- below.
- */
- #define BLASTKAR_SMALL_FLOAT 1.0e-297
- static Int4
- BlastKarlinEtoS_simple(double E, /* Expect value */
- Blast_KarlinBlk* kbp,
- Int8 searchsp) /* size of search space */
- {
- double Lambda, K, H; /* parameters for Karlin statistics */
- Int4 S;
- Lambda = kbp->Lambda;
- K = kbp->K;
- H = kbp->H;
- if (Lambda < 0. || K < 0. || H < 0.)
- {
- return BLAST_SCORE_MIN;
- }
- E = MAX(E, BLASTKAR_SMALL_FLOAT);
- S = (Int4) (ceil( log((double)(K * searchsp / E)) / Lambda ));
- return S;
- }
- /* Compute a divisor used to weight the evalue of a collection of
- * "nsegs" distinct alignments. These divisors are used to compensate
- * for the effect of choosing the best among multiple collections of
- * alignments. See
- *
- * Stephen F. Altschul. Evaluating the statitical significance of
- * multiple distinct local alignments. In Suhai, editior, Theoretical
- * and Computational Methods in Genome Research, pages 1-14. Plenum
- * Press, New York, 1997.
- *
- * The "decayrate" parameter of this routine is a value in the
- * interval (0,1). Typical values of decayrate are .1 and .5. */
- double
- BLAST_GapDecayDivisor(double decayrate, unsigned nsegs )
- {
- return (1. - decayrate) * BLAST_Powi(decayrate, nsegs - 1);
- }
- /*
- BlastCutoffs
- Calculate the cutoff score, S, and the highest expected score.
- */
- Int2
- BLAST_Cutoffs(Int4 *S, /* cutoff score */
- double* E, /* expected no. of HSPs scoring at or above S */
- Blast_KarlinBlk* kbp,
- Int8 searchsp, /* size of search space. */
- Boolean dodecay, /* TRUE ==> use gapdecay feature */
- double gap_decay_rate)
- {
- Int4 s = *S, es;
- double e = *E, esave;
- Boolean s_changed = FALSE;
- if (kbp->Lambda == -1. || kbp->K == -1. || kbp->H == -1.)
- return 1;
- /*
- Calculate a cutoff score, S, from the Expected
- (or desired) number of reported HSPs, E.
- */
- es = 1;
- esave = e;
- if (e > 0.)
- {
- if (dodecay) {
- /* Invert the adjustment to the e-value that will be applied
- * to compensate for the effect of choosing the best among
- * multiple alignments */
- if( gap_decay_rate > 0 && gap_decay_rate < 1 ) {
- e *= BLAST_GapDecayDivisor(gap_decay_rate, 1);
- }
- }
- es = BlastKarlinEtoS_simple(e, kbp, searchsp);
- }
- /*
- Pick the larger cutoff score between the user's choice
- and that calculated from the value of E.
- */
- if (es > s) {
- s_changed = TRUE;
- *S = s = es;
- }
- /*
- Re-calculate E from the cutoff score, if E going in was too high
- */
- if (esave <= 0. || !s_changed)
- {
- e = BLAST_KarlinStoE_simple(s, kbp, searchsp);
- if (dodecay) {
- /* Weight the e-value to compensate for the effect of
- * choosing the best of more than one collection of
- * distinct alignments */
- if( gap_decay_rate > 0 && gap_decay_rate < 1 ) {
- e /= BLAST_GapDecayDivisor(gap_decay_rate, 1);
- }
- }
- *E = e;
- }
- return 0;
- }
- /*
- BlastKarlinStoE() -- given a score, return the associated Expect value
- Error return value is -1.
- */
- double
- BLAST_KarlinStoE_simple(Int4 S,
- Blast_KarlinBlk* kbp,
- Int8 searchsp) /* size of search space. */
- {
- double Lambda, K, H; /* parameters for Karlin statistics */
- Lambda = kbp->Lambda;
- K = kbp->K;
- H = kbp->H;
- if (Lambda < 0. || K < 0. || H < 0.) {
- return -1.;
- }
- return (double) searchsp * exp((double)(-Lambda * S) + kbp->logK);
- }
- /*
- BlastKarlinPtoE -- convert a P-value to an Expect value
- */
- static double
- BlastKarlinPtoE(double p)
- {
- if (p < 0. || p > 1.0)
- {
- return INT4_MIN;
- }
- if (p == 1)
- return INT4_MAX;
- return -BLAST_Log1p(-p);
- }
- static double tab2[] = { /* table for r == 2 */
- 0.01669, 0.0249, 0.03683, 0.05390, 0.07794, 0.1111, 0.1559, 0.2146,
- 0.2890, 0.3794, 0.4836, 0.5965, 0.7092, 0.8114, 0.8931, 0.9490,
- 0.9806, 0.9944, 0.9989
- };
- static double tab3[] = { /* table for r == 3 */
- 0.9806, 0.9944, 0.9989, 0.0001682,0.0002542,0.0003829,0.0005745,0.0008587,
- 0.001278, 0.001893, 0.002789, 0.004088, 0.005958, 0.008627, 0.01240, 0.01770,
- 0.02505, 0.03514, 0.04880, 0.06704, 0.09103, 0.1220, 0.1612, 0.2097,
- 0.2682, 0.3368, 0.4145, 0.4994, 0.5881, 0.6765, 0.7596, 0.8326,
- 0.8922, 0.9367, 0.9667, 0.9846, 0.9939, 0.9980
- };
- static double tab4[] = { /* table for r == 4 */
- 2.658e-07,4.064e-07,6.203e-07,9.450e-07,1.437e-06,2.181e-06,3.302e-06,4.990e-06,
- 7.524e-06,1.132e-05,1.698e-05,2.541e-05,3.791e-05,5.641e-05,8.368e-05,0.0001237,
- 0.0001823,0.0002677,0.0003915,0.0005704,0.0008275,0.001195, 0.001718, 0.002457,
- 0.003494, 0.004942, 0.006948, 0.009702, 0.01346, 0.01853, 0.02532, 0.03431,
- 0.04607, 0.06128, 0.08068, 0.1051, 0.1352, 0.1719, 0.2157, 0.2669,
- 0.3254, 0.3906, 0.4612, 0.5355, 0.6110, 0.6849, 0.7544, 0.8168,
- 0.8699, 0.9127, 0.9451, 0.9679, 0.9827, 0.9915, 0.9963
- };
- static double* table[] = { tab2, tab3, tab4 };
- static short tabsize[] = { DIM(tab2)-1, DIM(tab3)-1, DIM(tab4)-1 };
- static double f (double,void*);
- static double g (double,void*);
- /*
- Estimate the Sum P-value by calculation or interpolation, as appropriate.
- Approx. 2-1/2 digits accuracy minimum throughout the range of r, s.
- r = number of segments
- s = total score (in nats), adjusted by -r*log(KN)
- */
- static double
- BlastSumP(Int4 r, double s)
- {
- Int4 i, r1, r2;
- double a;
- if (r == 1)
- return -BLAST_Expm1(-exp(-s));
- if (r <= 4) {
- if (r < 1)
- return 0.;
- r1 = r - 1;
- if (s >= r*r+r1) {
- a = BLAST_LnGammaInt(r+1);
- return r * exp(r1*log(s)-s-a-a);
- }
- if (s > -2*r) {
- /* interpolate */
- i = (Int4) (a = s+s+(4*r));
- a -= i;
- i = tabsize[r2 = r - 2] - i;
- return a*table[r2][i-1] + (1.-a)*table[r2][i];
- }
- return 1.;
- }
- return BlastSumPCalc(r, s);
- }
- /*
- BlastSumPCalc
- Evaluate the following double integral, where r = number of segments
- and s = the adjusted score in nats:
- (r-2) oo oo
- Prob(r,s) = r - - (r-2)
- ------------- | exp(-y) | x exp(-exp(x - y/r)) dx dy
- (r-1)! (r-2)! U U
- s 0
- */
- static double
- BlastSumPCalc(int r, double s)
- {
- int r1, itmin;
- double t, d, epsilon;
- double est_mean, mean, stddev, stddev4;
- double xr, xr1, xr2, logr;
- double args[6];
- epsilon = BLAST_SUMP_EPSILON_DEFAULT; /* accuracy for SumP calcs. */
- if (r == 1) {
- if (s > 8.)
- return exp(-s);
- return -BLAST_Expm1(-exp(-s));
- }
- if (r < 1)
- return 0.;
- xr = r;
- if (r < 8) {
- if (s <= -2.3*xr)
- return 1.;
- }
- else if (r < 15) {
- if (s <= -2.5*xr)
- return 1.;
- }
- else if (r < 27) {
- if (s <= -3.0*xr)
- return 1.;
- }
- else if (r < 51) {
- if (s <= -3.4*xr)
- return 1.;
- }
- else if (r < 101) {
- if (s <= -4.0*xr)
- return 1.;
- }
- /* stddev in the limit of infinite r, but quite good for even small r */
- stddev = sqrt(xr);
- stddev4 = 4.*stddev;
- xr1 = r1 = r - 1;
- if (r > 100) {
- /* Calculate lower bound on the mean using inequality log(r) <= r */
- est_mean = -xr * xr1;
- if (s <= est_mean - stddev4)
- return 1.;
- }
- /* mean is rather close to the mode, and the mean is readily calculated */
- /* mean in the limit of infinite r, but quite good for even small r */
- logr = log(xr);
- mean = xr * (1. - logr) - 0.5;
- if (s <= mean - stddev4)
- return 1.;
- if (s >= mean) {
- t = s + 6.*stddev;
- itmin = 1;
- }
- else {
- t = mean + 6.*stddev;
- itmin = 2;
- }
- #define ARG_R args[0]
- #define ARG_R2 args[1]
- #define ARG_ADJ1 args[2]
- #define ARG_ADJ2 args[3]
- #define ARG_SDIVR args[4]
- #define ARG_EPS args[5]
- ARG_R = xr;
- ARG_R2 = xr2 = r - 2;
- ARG_ADJ1 = xr2*logr - BLAST_LnGammaInt(r1) - BLAST_LnGammaInt(r);
- ARG_EPS = epsilon;
- do {
- d = BLAST_RombergIntegrate(g, args, s, t, epsilon, 0, itmin);
- #ifdef BLASTKAR_HUGE_VAL
- if (d == BLASTKAR_HUGE_VAL)
- return d;
- #endif
- } while (s < mean && d < 0.4 && itmin++ < 4);
- return (d < 1. ? d : 1.);
- }
- static double
- g(double s, void* vp)
- {
- register double* args = vp;
- double mx;
-
- ARG_ADJ2 = ARG_ADJ1 - s;
- ARG_SDIVR = s / ARG_R; /* = s / r */
- mx = (s > 0. ? ARG_SDIVR + 3. : 3.);
- return BLAST_RombergIntegrate(f, vp, 0., mx, ARG_EPS, 0, 1);
- }
- static double
- f(double x, void* vp)
- {
- register double* args = vp;
- register double y;
- y = exp(x - ARG_SDIVR);
- #ifdef BLASTKAR_HUGE_VAL
- if (y == BLASTKAR_HUGE_VAL)
- return 0.;
- #endif
- if (ARG_R2 == 0.)
- return exp(ARG_ADJ2 - y);
- if (x == 0.)
- return 0.;
- return exp(ARG_R2*log(x) + ARG_ADJ2 - y);
- }
- /*
- Calculates the e-value for alignments with "small" gaps (typically
- under fifty residues/basepairs) following ideas of Stephen Altschul's.
- */
- double
- BLAST_SmallGapSumE(
- Blast_KarlinBlk * kbp, /* statistical parameters */
- Int4 gap, /* maximum size of gaps between alignments */
- Int2 num, /* the number of distinct alignments in this
- * collection */
- double score_prime, /* the sum of the scores of these alignments
- * each weighted by an appropriate value of
- * Lambda */
- Int4 query_length, /* the effective len of the query seq */
- Int4 subject_length, /* the effective len of the database seq */
- double weight_divisor) /* a divisor used to weight the e-value
- * when multiple collections of alignments
- * are being considered by the calling
- * routine */
- {
- double search_space; /* The effective size of the search space */
- double sum_p; /* The p-value of this set of alignments */
- double sum_e; /* The e-value of this set of alignments */
- search_space = (double)subject_length*(double)query_length;
- score_prime -= kbp->logK +
- log(search_space) + (num-1)*(kbp->logK + 2*log((double)gap));
- score_prime -= BLAST_LnFactorial((double) num);
- sum_p = BlastSumP(num, score_prime);
- sum_e = BlastKarlinPtoE(sum_p);
- if( weight_divisor == 0 || (sum_e /= weight_divisor) > INT4_MAX ) {
- sum_e = INT4_MAX;
- }
- return sum_e;
- }
- /*
- Calculates the e-value of a collection multiple distinct
- alignments with asymmetric gaps between the alignments. The gaps
- in one (protein) sequence are typically small (like in
- BLAST_SmallGapSumE) gap an the gaps in the other (translated DNA)
- sequence are possibly large (up to 4000 bp.) This routine is used
- for linking HSPs representing exons in the DNA sequence that are
- separated by introns.
- */
- double
- BLAST_UnevenGapSumE(
- Blast_KarlinBlk * kbp, /* statistical parameters */
- Int4 p_gap, /* maximum size of gaps between alignments,
- * in one sequence */
- Int4 n_gap, /* maximum size of gaps between alignments,
- * in the other sequence */
- Int2 num, /* the number of distinct alignments in this
- * collection */
- double score_prime, /* the sum of the scores of these alignments
- * each weighted by an appropriate value of
- * Lambda */
- Int4 query_length, /* the effective len of the query seq */
- Int4 subject_length, /* the effective len of the database seq */
- double weight_divisor) /* a divisor used to weight the e-value
- * when multiple collections of alignments
- * are being considered by the calling
- * routine */
- {
- double search_space; /* The effective size of the search space */
- double sum_p; /* The p-value of this set of alignments */
- double sum_e; /* The e-value of this set of alignments */
- search_space = (double)subject_length*(double)query_length;
- score_prime -=
- kbp->logK + log(search_space) +
- (num-1)*(kbp->logK + log((double)p_gap) + log((double)n_gap));
- score_prime -= BLAST_LnFactorial((double) num);
- sum_p = BlastSumP(num, score_prime);
- sum_e = BlastKarlinPtoE(sum_p);
- if( weight_divisor == 0 || (sum_e /= weight_divisor) > INT4_MAX ) {
- sum_e = INT4_MAX;
- }
- return sum_e;
- }
- /*
- Calculates the e-value if a collection of distinct alignments with
- arbitrarily large gaps between the alignments
- */
- double
- BLAST_LargeGapSumE(
- Blast_KarlinBlk * kbp, /* statistical parameters */
- Int2 num, /* the number of distinct alignments in this
- * collection */
- double score_prime, /* the sum of the scores of these alignments
- * each weighted by an appropriate value of
- * Lambda */
- Int4 query_length, /* the effective len of the query seq */
- Int4 subject_length, /* the effective len of the database seq */
- double weight_divisor) /* a divisor used to weight the e-value
- * when multiple collections of alignments
- * are being considered by the calling
- * routine */
- {
- double sum_p; /* The p-value of this set of alignments */
- double sum_e; /* The e-value of this set of alignments */
- /* The next two variables are for compatability with Warren's code. */
- double lcl_subject_length; /* query_length as a float */
- double lcl_query_length; /* subject_length as a float */
- lcl_query_length = (double) query_length;
- lcl_subject_length = (double) subject_length;
- score_prime -= num*(kbp->logK + log(lcl_subject_length*lcl_query_length))
- - BLAST_LnFactorial((double) num);
- sum_p = BlastSumP(num, score_prime);
- sum_e = BlastKarlinPtoE(sum_p);
- if( weight_divisor == 0 || (sum_e /= weight_divisor) > INT4_MAX ) {
- sum_e = INT4_MAX;
- }
- return sum_e;
- }
- /*------------------- RPS BLAST functions --------------------*/
- static double
- RPSfindUngappedLambda(Char *matrixName)
- {
- if (0 == strcmp(matrixName, "BLOSUM62"))
- return 0.3176;
- if (0 == strcmp(matrixName, "BLOSUM90"))
- return 0.3346;
- if (0 == strcmp(matrixName, "BLOSUM80"))
- return 0.3430;
- if (0 == strcmp(matrixName, "BLOSUM50"))
- return 0.232;
- if (0 == strcmp(matrixName, "BLOSUM45"))
- return 0.2291;
- if (0 == strcmp(matrixName, "PAM30"))
- return 0.340;
- if (0 == strcmp(matrixName, "PAM70"))
- return 0.3345;
- if (0 == strcmp(matrixName, "PAM250"))
- return 0.229;
- return(0);
- }
- /* matrix is a position-specific score matrix with matrixLength positions
- queryProbArray is an array containing the probability of occurrence
- of each residue in the query
- scoreArray is an array of probabilities for each score that is
- to be used as a field in return_sfp
- return_sfp is a the structure to be filled in and returned
- range is the size of scoreArray and is an upper bound on the
- difference between maximum score and minimum score in the matrix
- the routine fillSfp computes the probability of each score weighted
- by the probability of each query residue and fills those probabilities
- into scoreArray and puts scoreArray as a field in
- that in the structure that is returned
- for indexing convenience the field storing scoreArray points to the
- entry for score 0, so that referring to the -k index corresponds to
- score -k
- */
- static void
- RPSFillScores(Int4 **matrix, Int4 matrixLength,
- double *queryProbArray, double *scoreArray,
- Blast_ScoreFreq* return_sfp, Int4 range)
- {
- Int4 minScore, maxScore; /*observed minimum and maximum scores */
- Int4 i,j; /* indices */
- double recipLength; /* 1/matrix length as a double */
- minScore = maxScore = 0;
- for (i = 0; i < matrixLength; i++) {
- for (j = 0 ; j < PSI_ALPHABET_SIZE; j++) {
- if (j == AMINOACID_TO_NCBISTDAA['X'])
- continue;
- if ((matrix[i][j] > BLAST_SCORE_MIN) &&
- (matrix[i][j] < minScore))
- minScore = matrix[i][j];
- if (matrix[i][j] > maxScore)
- maxScore = matrix[i][j];
- }
- }
- return_sfp->obs_min = minScore;
- return_sfp->obs_max = maxScore;
- for (i = 0; i < range; i++)
- scoreArray[i] = 0.0;
- return_sfp->sprob = &(scoreArray[-minScore]); /*center around 0*/
- recipLength = 1.0 / (double) matrixLength;
- for(i = 0; i < matrixLength; i++) {
- for (j = 0; j < PSI_ALPHABET_SIZE; j++) {
- if (j == AMINOACID_TO_NCBISTDAA['X'])
- continue;
- if(matrix[i][j] >= minScore)
- return_sfp->sprob[matrix[i][j]] += recipLength *
- queryProbArray[j];
- }
- }
- return_sfp->score_avg = 0;
- for(i = minScore; i <= maxScore; i++)
- return_sfp->score_avg += i * return_sfp->sprob[i];
- }
- /* Given a sequence of 'length' amino acid residues, compute the
- probability of each residue and put that in the array resProb*/
- static void
- RPSFillResidueProbability(Uint1 * sequence, Int4 length, double * resProb)
- {
- Int4 frequency[PSI_ALPHABET_SIZE]; /*frequency of each letter*/
- Int4 i; /*index*/
- Int4 denominator; /*length not including X's*/
- denominator = length;
- for(i = 0; i < PSI_ALPHABET_SIZE; i++)
- frequency[i] = 0;
- for(i = 0; i < length; i++) {
- if (sequence[i] != AMINOACID_TO_NCBISTDAA['X'])
- frequency[sequence[i]]++;
- else
- denominator--;
- }
- for(i = 0; i < PSI_ALPHABET_SIZE; i++) {
- if (frequency[i] == 0)
- resProb[i] = 0.0;
- else
- resProb[i] = ((double) frequency[i]) /((double) denominator);
- }
- }
- /* Calculate a new PSSM, using composition-based statistics, for use
- with RPS BLAST. This function produces a PSSM for a single RPS DB
- sequence (of size db_seq_length) and incorporates information from
- the RPS blast query. Each individual database sequence must call this
- function to retrieve its own PSSM. The matrix is returned (and must
- be freed elsewhere). posMatrix is the portion of the complete
- concatenated PSSM that is specific to this DB sequence */
- Int4 **
- RPSCalculatePSSM(double scalingFactor, Int4 rps_query_length,
- Uint1 * rps_query_seq, Int4 db_seq_length,
- Int4 **posMatrix)
- {
- double *scoreArray; /*array of score probabilities*/
- double *resProb; /*array of probabilities for each residue*/
- Blast_ScoreFreq * return_sfp;/*score frequency pointers to compute lambda*/
- Int4* * returnMatrix; /*the PSSM to return */
- double initialUngappedLambda;
- double scaledInitialUngappedLambda;
- double correctUngappedLambda;
- double finalLambda;
- double temp; /*intermediate variable for adjusting matrix*/
- Int4 index, inner_index;
- resProb = (double *)malloc(PSI_ALPHABET_SIZE * sizeof(double));
- scoreArray = (double *)malloc(BLAST_SCORE_RANGE_MAX * sizeof(double));
- return_sfp = (Blast_ScoreFreq *)malloc(sizeof(Blast_ScoreFreq));
- RPSFillResidueProbability(rps_query_seq, rps_query_length, resProb);
- RPSFillScores(posMatrix, db_seq_length, resProb, scoreArray,
- return_sfp, BLAST_SCORE_RANGE_MAX);
- initialUngappedLambda = RPSfindUngappedLambda("BLOSUM62");
- scaledInitialUngappedLambda = initialUngappedLambda / scalingFactor;
- correctUngappedLambda = Blast_KarlinLambdaNR(return_sfp,
- scaledInitialUngappedLambda);
- sfree(resProb);
- sfree(scoreArray);
- sfree(return_sfp);
- if(correctUngappedLambda == -1.0)
- return NULL;
- finalLambda = correctUngappedLambda/scaledInitialUngappedLambda;
- returnMatrix = (Int4 **)_PSIAllocateMatrix((db_seq_length+1),
- PSI_ALPHABET_SIZE,
- sizeof(Int4));
- for (index = 0; index < db_seq_length+1; index++) {
- for (inner_index = 0; inner_index < PSI_ALPHABET_SIZE; inner_index++) {
- if (posMatrix[index][inner_index] <= BLAST_SCORE_MIN ||
- inner_index == AMINOACID_TO_NCBISTDAA['X']) {
- returnMatrix[index][inner_index] =
- posMatrix[index][inner_index];
- }
- else {
- temp = ((double)(posMatrix[index][inner_index])) * finalLambda;
- returnMatrix[index][inner_index] = BLAST_Nint(temp);
- }
- }
- }
- return returnMatrix;
- }
- /**
- * Computes the adjustment to the lengths of the query and database sequences
- * that is used to compensate for edge effects when computing evalues.
- *
- * The length adjustment is an integer-valued approximation to the fixed
- * point of the function
- *
- * f(ell) = beta +
- * (alpha/lambda) * (log K + log((m - ell)*(n - N ell)))
- *
- * where m is the query length n is the length of the database and N is the
- * number of sequences in the database. The values beta, alpha, lambda and
- * K are statistical, Karlin-Altschul parameters.
- *
- * The value of the length adjustment computed by this routine, A,
- * will always be an integer smaller than the fixed point of
- * f(ell). Usually, it will be the largest such integer. However, the
- * computed length adjustment, A, will also be so small that
- *
- * K * (m - A) * (n - N * A) > MAX(m,n).
- *
- * Moreover, an iterative method is used to compute A, and under
- * unusual circumstances the iterative method may not converge.
- *
- * @param K the statistical parameter K
- * @param logK the natural logarithm of K
- * @param alpha_d_lambda the ratio of the statistical parameters
- * alpha and lambda (for ungapped alignments, the
- * value 1/H should be used)
- * @param beta the statistical parameter beta (for ungapped
- * alignments, beta == 0)
- * @param query_length the length of the query sequence
- * @param db_length the length of the database
- * @param db_num_seqs the number of sequences in the database
- * @param length_adjustment the computed value of the length adjustment [out]
- *
- * @return 0 if length_adjustment is known to be the largest integer less
- * than the fixed point of f(ell); 1 otherwise.
- */
- Int4
- BLAST_ComputeLengthAdjustment(double K,
- double logK,
- double alpha_d_lambda,
- double beta,
- Int4 query_length,
- Int8 db_length,
- Int4 db_num_seqs,
- Int4 * length_adjustment)
- {
- Int4 i; /* iteration index */
- const Int4 maxits = 20; /* maximum allowed iterations */
- double m = (double) query_length;
- double n = (double) db_length;
- double N = (double) db_num_seqs;
- double ell; /* A float value of the length adjustment */
- double ss; /* effective size of the search space */
- double ell_min = 0, ell_max; /* At each iteration i,
- * ell_min <= ell <= ell_max. */
- Boolean converged = FALSE; /* True if the iteration converged */
- double ell_next = 0; /* Value the variable ell takes at iteration
- * i + 1 */
- /* Choose ell_max to be the largest nonnegative value that satisfies
- *
- * K * (m - ell) * (n - N * ell) > MAX(m,n)
- *
- * Use quadratic formula: 2 c /( - b + sqrt( b*b - 4 * a * c )) */
- { /* scope of a, mb, and c, the coefficients in the quadratic formula
- * (the variable mb is -b) */
- double a = N;
- double mb = m * N + n;
- double c = n * m - MAX(m, n) / K;
- if(c < 0) {
- *length_adjustment = 0;
- return 1;
- } else {
- ell_max = 2 * c / (mb + sqrt(mb * mb - 4 * a * c));
- }
- } /* end scope of a, mb and c */
- for(i = 1; i <= maxits; i++) { /* for all iteration indices */
- double ell_bar; /* proposed next value of ell */
- ell = ell_next;
- ss = (m - ell) * (n - N * ell);
- ell_bar = alpha_d_lambda * (logK + log(ss)) + beta;
- if(ell_bar >= ell) { /* ell is no bigger than the true fixed point */
- ell_min = ell;
- if(ell_bar - ell_min <= 1.0) {
- converged = TRUE;
- break;
- }
- if(ell_min == ell_max) { /* There are no more points to check */
- break;
- }
- } else { /* else ell is greater than the true fixed point */
- ell_max = ell;
- }
- if(ell_min <= ell_bar && ell_bar <= ell_max) {
- /* ell_bar is in range. Accept it */
- ell_next = ell_bar;
- } else { /* else ell_bar is not in range. Reject it */
- ell_next = (i == 1) ? ell_max : (ell_min + ell_max) / 2;
- }
- } /* end for all iteration indices */
- if(converged) { /* the iteration converged */
- /* If ell_fixed is the (unknown) true fixed point, then we
- * wish to set (*length_adjustment) to floor(ell_fixed). We
- * assume that floor(ell_min) = floor(ell_fixed) */
- *length_adjustment = (Int4) ell_min;
- /* But verify that ceil(ell_min) != floor(ell_fixed) */
- ell = ceil(ell_min);
- if( ell <= ell_max ) {
- ss = (m - ell) * (n - N * ell);
- if(alpha_d_lambda * (logK + log(ss)) + beta >= ell) {
- /* ceil(ell_min) == floor(ell_fixed) */
- *length_adjustment = (Int4) ell;
- }
- }
- } else { /* else the iteration did not converge. */
- /* Use the best value seen so far */
- *length_adjustment = (Int4) ell_min;
- }
- return converged ? 0 : 1;
- }
- /*
- * ===========================================================================
- *
- * $Log: blast_stat.c,v $
- * Revision 1000.6 2004/06/01 18:07:50 gouriano
- * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.77
- *
- * Revision 1.77 2004/05/24 15:09:40 camacho
- * Fixed conflict
- *
- * Revision 1.76 2004/05/24 13:26:27 madden
- * Fix PC compiler warnings
- *
- * Revision 1.75 2004/05/20 16:29:30 madden
- * Make searchsp an Int8 consistent with rest of blast
- *
- * Revision 1.74 2004/05/19 15:34:38 dondosha
- * Moved Blast_ResComp definition from header file
- *
- * Revision 1.73 2004/05/19 14:52:03 camacho
- * 1. Added doxygen tags to enable doxygen processing of algo/blast/core
- * 2. Standardized copyright, CVS $Id string, $Log and rcsid formatting and i
- * location
- * 3. Added use of @todo doxygen keyword
- *
- * Revision 1.72 2004/05/17 10:37:38 camacho
- * Rename BLAST_ScoreFreq, BLASTMatrixStructure and BLAST_ResComp to avoid conflicts with C toolkit
- *
- * Revision 1.71 2004/05/07 15:23:47 papadopo
- * add initialization of scale factor to ScoreBlkNew
- *
- * Revision 1.70 2004/05/06 15:59:29 camacho
- * Made Blast_KarlinBlkCalc non-static
- *
- * Revision 1.69 2004/05/06 15:05:13 camacho
- * Fix to previous commit
- *
- * Revision 1.68 2004/05/06 14:44:27 camacho
- * Made Blast_ScoreFreqDestruct non-static
- *
- * Revision 1.67 2004/05/05 21:16:24 camacho
- * Make Blast_GetStdAlphabet and Blast_ScoreFreqNew non-static
- *
- * Revision 1.66 2004/05/04 13:00:02 madden
- * Change BlastKarlinBlkStandardCalcEx to more descriptive Blast_KarlinBlkIdealCalc, make public
- *
- * Revision 1.65 2004/04/30 14:39:44 papadopo
- * 1. Remove unneeded #defines
- * 2. use BLAST_SCORE_RANGE_MAX during RPS PSSM creation instead of
- * (possibly incompatible) RPS_SCORE_MAX
- * 3. return NULL instead of FALSE on an error
- *
- * Revision 1.64 2004/04/30 12:58:49 camacho
- * Replace RPSKarlinLambdaNR by Blast_KarlinLambdaNR
- *
- * Revision 1.63 2004/04/29 20:32:38 papadopo
- * remove RPS_SCORE_MIN, since it turned out to be a workaround for a bug that has since been fixed
- *
- * Revision 1.62 2004/04/29 19:58:03 camacho
- * Use generic matrix allocator/deallocator from blast_psi_priv.h
- *
- * Revision 1.61 2004/04/28 14:40:23 madden
- * Changes from Mike Gertz:
- * - I created the new routine BLAST_GapDecayDivisor that computes a
- * divisor used to weight the evalue of a collection of distinct
- * alignments.
- * - I removed BLAST_GapDecay and BLAST_GapDecayInverse which had become
- * redundant.
- * - I modified the BLAST_Cutoffs routine so that it uses the value
- * returned by BLAST_GapDecayDivisor to weight evalues.
- * - I modified BLAST_SmallGapSumE, BLAST_LargeGapSumE and
- * BLAST_UnevenGapSumE no longer refer to the gap_prob parameter.
- * Replaced the gap_decay_rate parameter of each of these routines with
- * a weight_divisor parameter. Added documentation.
- *
- * Revision 1.60 2004/04/23 19:06:33 camacho
- * Do NOT use lowercase names for #defines
- *
- * Revision 1.59 2004/04/23 13:49:20 madden
- * Cleaned up ifndef in BlastKarlinLHtoK
- *
- * Revision 1.58 2004/04/23 13:21:25 madden
- * Rewrote BlastKarlinLHtoK to do the following and more:
- * 1. fix a bug whereby the wrong formula was used when high score == 1
- * and low score == -1;
- * 2. fix a methodological error of truncating the first sum
- * and trying to make it converge quickly by adding terms
- * of a geometric progression, even though the geometric progression
- * estimate is not correct in all cases;
- * the old adjustment code is left in for historical purposes but
- * #ifdef'd out
- * 3. Eliminate the Boolean bi_modal_score variable. The old test that
- * set the value of bi_modal_score would frequently fail to choose the
- * correct value due to rounding error.
- * 4. changed numerous local variable names to make them more meaningful;
- * 5. added substantial comments to explain what the procedure
- * is doing and what each variable represents
- *
- * Revision 1.57 2004/04/19 12:58:18 madden
- * Changed BLAST_KarlinBlk to Blast_KarlinBlk to avoid conflict with blastkar.h structure, renamed some functions to start with Blast_Karlin, made Blast_KarlinBlkDestruct public
- *
- * Revision 1.56 2004/04/12 18:57:31 madden
- * Rename BLAST_ResFreq to Blast_ResFreq, make Blast_ResFreqNew, Blast_ResFreqDestruct, and Blast_ResFreqStdComp non-static
- *
- * Revision 1.55 2004/04/08 13:53:10 papadopo
- * fix doxygen warning
- *
- * Revision 1.54 2004/04/07 03:06:16 camacho
- * Added blast_encoding.[hc], refactoring blast_stat.[hc]
- *
- v * Revision 1.53 2004/04/05 18:53:35 madden
- * Set dimensions if matrix from memory
- *
- * Revision 1.52 2004/04/01 14:14:02 lavr
- * Spell "occurred", "occurrence", and "occurring"
- *
- * Revision 1.51 2004/03/31 17:50:09 papadopo
- * Mike Gertz' changes for length adjustment calculations
- *
- * Revision 1.50 2004/03/11 18:52:41 camacho
- * Remove THREADS_IMPLEMENTED
- *
- * Revision 1.49 2004/03/10 18:00:06 camacho
- * Remove outdated references to blastkar
- *
- * Revision 1.48 2004/03/05 17:52:33 papadopo
- * Allow 32-bit context numbers for queries
- *
- * Revision 1.47 2004/03/04 21:07:51 papadopo
- * add RPS BLAST functionality
- *
- * Revision 1.46 2004/02/19 21:16:48 dondosha
- * Use enum type for severity argument in Blast_MessageWrite
- *
- * Revision 1.45 2003/12/05 16:03:57 camacho
- * Remove compiler warnings
- *
- * Revision 1.44 2003/11/28 22:39:11 camacho
- * + static keyword to BlastKarlinLtoH
- *
- * Revision 1.43 2003/11/28 15:03:48 camacho
- * Added static keyword to BlastKarlinLtoH
- *
- * Revision 1.42 2003/11/26 19:12:13 madden
- * code to simplify some routines and use NlmKarlinLambdaNR in place of BlastKarlinLambdaBis (following Mike Gertzs changes to blastkar.c )
- *
- * Revision 1.41 2003/11/24 23:18:32 dondosha
- * Added gap_decay_rate argument to BLAST_Cutoffs; removed BLAST_Cutoffs_simple
- *
- * Revision 1.40 2003/11/19 15:17:42 dondosha
- * Removed unused members from Karlin block structure
- *
- * Revision 1.39 2003/10/16 15:55:22 coulouri
- * fix uninitialized variables
- *
- * Revision 1.38 2003/10/16 15:52:08 coulouri
- * fix uninitialized variables
- *
- * Revision 1.37 2003/10/15 16:59:43 coulouri
- * type correctness fixes
- *
- * Revision 1.36 2003/10/02 22:08:34 dondosha
- * Corrections for one-strand translated searches
- *
- * Revision 1.35 2003/09/26 19:01:59 madden
- * Prefix ncbimath functions with BLAST_
- *
- * Revision 1.34 2003/09/09 14:21:39 coulouri
- * change blastkar.h to blast_stat.h
- *
- * Revision 1.33 2003/09/02 21:12:07 camacho
- * Fix small memory leak
- *
- * Revision 1.32 2003/08/26 15:23:51 dondosha
- * Rolled back previous change as it is not necessary any more
- *
- * Revision 1.31 2003/08/25 22:29:07 dondosha
- * Default matrix loading is defined only in C++ toolkit
- *
- * Revision 1.30 2003/08/25 18:05:41 dondosha
- * Moved assert statement after variables declarations
- *
- * Revision 1.29 2003/08/25 16:23:33 camacho
- * +Loading protein scoring matrices from utils/tables
- *
- * Revision 1.28 2003/08/11 15:01:59 dondosha
- * Added algo/blast/core to all #included headers
- *
- * Revision 1.27 2003/08/01 17:27:04 dondosha
- * Renamed external functions to avoid collisions with ncbitool library; made other functions static
- *
- * Revision 1.26 2003/07/31 18:48:49 dondosha
- * Use Int4 instead of BLAST_Score
- *
- * Revision 1.25 2003/07/31 17:48:06 madden
- * Remove call to FileLength
- *
- * Revision 1.24 2003/07/31 14:31:41 camacho
- * Replaced Char for char
- *
- * Revision 1.23 2003/07/31 14:19:28 camacho
- * Replaced FloatHi for double
- *
- * Revision 1.22 2003/07/31 00:32:37 camacho
- * Eliminated Ptr notation
- *
- * Revision 1.21 2003/07/30 22:08:09 dondosha
- * Process of finding path to the matrix is moved out of the blast library
- *
- * Revision 1.20 2003/07/30 21:52:41 camacho
- * Follow conventional structure definition
- *
- * Revision 1.19 2003/07/30 19:39:14 camacho
- * Remove PNTRs
- *
- * Revision 1.18 2003/07/30 17:58:25 dondosha
- * Changed ValNode to ListNode
- *
- * Revision 1.17 2003/07/30 17:15:00 dondosha
- * Minor fixes for very strict compiler warnings
- *
- * Revision 1.16 2003/07/30 17:06:40 camacho
- * Removed old cvs log
- *
- * Revision 1.15 2003/07/30 16:32:02 madden
- * Use ansi functions when possible
- *
- * Revision 1.14 2003/07/30 15:29:37 madden
- * Removed MemSets
- *
- * Revision 1.13 2003/07/29 14:42:31 coulouri
- * use strdup() instead of StringSave()
- *
- * Revision 1.12 2003/07/28 19:04:15 camacho
- * Replaced all MemNews for calloc
- *
- * Revision 1.11 2003/07/28 03:41:49 camacho
- * Use f{open,close,gets} instead of File{Open,Close,Gets}
- *
- * Revision 1.10 2003/07/25 21:12:28 coulouri
- * remove constructions of the form "return sfree();" and "a=sfree(a);"
- *
- * Revision 1.9 2003/07/25 18:58:43 camacho
- * Avoid using StrUpper and StringHasNoText
- *
- * Revision 1.8 2003/07/25 17:25:43 coulouri
- * in progres:
- * * use malloc/calloc/realloc instead of Malloc/Calloc/Realloc
- * * add sfree() macro and __sfree() helper function to util.[ch]
- * * use sfree() instead of MemFree()
- *
- * Revision 1.7 2003/07/24 22:37:33 dondosha
- * Removed some unused function parameters
- *
- * Revision 1.6 2003/07/24 22:01:44 camacho
- * Removed unused variables
- *
- * Revision 1.5 2003/07/24 21:31:06 dondosha
- * Changed to calls to BlastConstructErrorMessage to API from blast_message.h
- *
- * Revision 1.4 2003/07/24 20:38:30 dondosha
- * Removed LIBCALL etc. macros
- *
- * Revision 1.3 2003/07/24 17:37:46 dondosha
- * Removed MakeBlastScore function that is dependent on objalign.h
- *
- * Revision 1.2 2003/07/24 15:50:49 dondosha
- * Commented out mutex operations
- *
- * Revision 1.1 2003/07/24 15:18:09 dondosha
- * Copy of blastkar.h from ncbitools library, stripped of dependency on ncbiobj
- *
- * ===========================================================================
- */