blast_psi_priv.c
上传用户:yhdzpy8989
上传日期:2007-06-13
资源大小:13604k
文件大小:51k
源码类别:

生物技术

开发平台:

C/C++

  1. /*
  2.  * ===========================================================================
  3.  * PRODUCTION $Log: blast_psi_priv.c,v $
  4.  * PRODUCTION Revision 1000.1  2004/06/01 18:07:34  gouriano
  5.  * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.6
  6.  * PRODUCTION
  7.  * ===========================================================================
  8.  */
  9. static char const rcsid[] =
  10.     "$Id: blast_psi_priv.c,v 1000.1 2004/06/01 18:07:34 gouriano Exp $";
  11. /* ===========================================================================
  12.  *
  13.  *                            PUBLIC DOMAIN NOTICE
  14.  *               National Center for Biotechnology Information
  15.  *
  16.  *  This software/database is a "United States Government Work" under the
  17.  *  terms of the United States Copyright Act.  It was written as part of
  18.  *  the author's official duties as a United States Government employee and
  19.  *  thus cannot be copyrighted.  This software/database is freely available
  20.  *  to the public for use. The National Library of Medicine and the U.S.
  21.  *  Government have not placed any restriction on its use or reproduction.
  22.  *
  23.  *  Although all reasonable efforts have been taken to ensure the accuracy
  24.  *  and reliability of the software and data, the NLM and the U.S.
  25.  *  Government do not and cannot warrant the performance or results that
  26.  *  may be obtained by using this software or data. The NLM and the U.S.
  27.  *  Government disclaim all warranties, express or implied, including
  28.  *  warranties of performance, merchantability or fitness for any particular
  29.  *  purpose.
  30.  *
  31.  *  Please cite the author in any work or product based on this material.
  32.  *
  33.  * ===========================================================================
  34.  *
  35.  * Author:  Alejandro Schaffer, ported by Christiam Camacho
  36.  *
  37.  */
  38. /** @file blast_psi_priv.c
  39.  * Defintions for functions in private interface for Position Iterated BLAST 
  40.  * API.
  41.  */
  42. #include "blast_psi_priv.h"
  43. #include "matrix_freq_ratios.h"
  44. /****************************************************************************/
  45. /* Constants */
  46. const unsigned int kQueryIndex = 0;
  47. const double kEpsilon = 0.0001;
  48. const double kDefaultEvalueForPosition = 1.0;
  49. const int kPsiScaleFactor = 200;
  50. /****************************************************************************/
  51. void**
  52. _PSIAllocateMatrix(unsigned int ncols, unsigned int nrows, 
  53.                    unsigned int data_type_sz)
  54. {
  55.     void** retval = NULL;
  56.     unsigned int i = 0;
  57.     if ( !(retval = (void**) malloc(sizeof(void*) * ncols)))
  58.         return NULL;
  59.     for (i = 0; i < ncols; i++) {
  60.         if ( !(retval[i] = (void*) calloc(nrows, data_type_sz))) {
  61.             retval = _PSIDeallocateMatrix(retval, i);
  62.             break;
  63.         }
  64.     }
  65.     return retval;
  66. }
  67. void**
  68. _PSIDeallocateMatrix(void** matrix, unsigned int ncols)
  69. {
  70.     unsigned int i = 0;
  71.     if (!matrix)
  72.         return NULL;
  73.     for (i = 0; i < ncols; i++) {
  74.         sfree(matrix[i]);
  75.     }
  76.     sfree(matrix);
  77.     return NULL;
  78. }
  79. void
  80. _PSICopyMatrix(double** dest, const double** src, 
  81.                unsigned int ncols, unsigned int nrows)
  82. {
  83.     unsigned int i = 0;
  84.     unsigned int j = 0;
  85.     for (i = 0; i < ncols; i++) {
  86.         for (j = 0; j < nrows; j++) {
  87.             dest[i][j] = src[i][j];
  88.         }
  89.     }
  90. }
  91. /****************************************************************************/
  92. PsiAlignedBlock*
  93. _PSIAlignedBlockNew(Uint4 num_positions)
  94. {
  95.     PsiAlignedBlock* retval = NULL;
  96.     Uint4 i = 0;
  97.     if ( !(retval = (PsiAlignedBlock*) calloc(1, sizeof(PsiAlignedBlock)))) {
  98.         return NULL;
  99.     }
  100.     retval->pos_extnt = (SSeqRange*) calloc(num_positions, sizeof(SSeqRange));
  101.     if ( !retval->pos_extnt ) {
  102.         return _PSIAlignedBlockFree(retval);
  103.     }
  104.     retval->size = (Uint4*) calloc(num_positions, sizeof(Uint4));
  105.     if ( !retval->size ) {
  106.         return _PSIAlignedBlockFree(retval);
  107.     }
  108.     for (i = 0; i < num_positions; i++) {
  109.         retval->pos_extnt[i].left = -1;
  110.         retval->pos_extnt[i].right = num_positions;
  111.     }
  112.     return retval;
  113. }
  114. PsiAlignedBlock*
  115. _PSIAlignedBlockFree(PsiAlignedBlock* aligned_blocks)
  116. {
  117.     if ( !aligned_blocks ) {
  118.         return NULL;
  119.     }
  120.     if (aligned_blocks->pos_extnt) {
  121.         sfree(aligned_blocks->pos_extnt);
  122.     }
  123.     if (aligned_blocks->size) {
  124.         sfree(aligned_blocks->size);
  125.     }
  126.     sfree(aligned_blocks);
  127.     return NULL;
  128. }
  129. PsiSequenceWeights*
  130. _PSISequenceWeightsNew(const PsiInfo* info, const BlastScoreBlk* sbp)
  131. {
  132.     PsiSequenceWeights* retval = NULL;
  133.     ASSERT(info);
  134.     retval = (PsiSequenceWeights*) calloc(1, sizeof(PsiSequenceWeights));
  135.     if ( !retval ) {
  136.         return NULL;
  137.     }
  138.     retval->row_sigma = (double*) calloc(info->num_seqs + 1, sizeof(double));
  139.     if ( !retval->row_sigma ) {
  140.         return _PSISequenceWeightsFree(retval);
  141.     }
  142.     retval->norm_seq_weights = (double*) calloc(info->num_seqs + 1, 
  143.                                                 sizeof(double));
  144.     if ( !retval->norm_seq_weights ) {
  145.         return _PSISequenceWeightsFree(retval);
  146.     }
  147.     retval->sigma = (double*) calloc(info->num_seqs + 1, sizeof(double));
  148.     if ( !retval->sigma ) {
  149.         return _PSISequenceWeightsFree(retval);
  150.     }
  151.     retval->match_weights = (double**) _PSIAllocateMatrix(info->query_sz + 1,
  152.                                                           PSI_ALPHABET_SIZE,
  153.                                                           sizeof(double));
  154.     retval->match_weights_size = info->query_sz + 1;
  155.     if ( !retval->match_weights ) {
  156.         return _PSISequenceWeightsFree(retval);
  157.     }
  158.     retval->std_prob = _PSIGetStandardProbabilities(sbp);
  159.     if ( !retval->std_prob ) {
  160.         return _PSISequenceWeightsFree(retval);
  161.     }
  162.     retval->info_content = (double*) calloc(info->query_sz, sizeof(double));
  163.     if ( !retval->info_content ) {
  164.         return _PSISequenceWeightsFree(retval);
  165.     }
  166.     return retval;
  167. }
  168. PsiSequenceWeights*
  169. _PSISequenceWeightsFree(PsiSequenceWeights* seq_weights)
  170. {
  171.     if ( !seq_weights ) {
  172.         return NULL;
  173.     }
  174.     if (seq_weights->row_sigma) {
  175.         sfree(seq_weights->row_sigma);
  176.     }
  177.     if (seq_weights->norm_seq_weights) {
  178.         sfree(seq_weights->norm_seq_weights);
  179.     }
  180.     if (seq_weights->sigma) {
  181.         sfree(seq_weights->sigma);
  182.     }
  183.     if (seq_weights->match_weights) {
  184.         _PSIDeallocateMatrix((void**) seq_weights->match_weights,
  185.                              seq_weights->match_weights_size);
  186.     }
  187.     if (seq_weights->std_prob) {
  188.         sfree(seq_weights->std_prob);
  189.     }
  190.     if (seq_weights->info_content) {
  191.         sfree(seq_weights->info_content);
  192.     }
  193.     sfree(seq_weights);
  194.     return NULL;
  195. }
  196. /****************************************************************************/
  197. /* Function prototypes */
  198. /* Purges any aligned segments which are identical to the query sequence */
  199. static void
  200. _PSIPurgeIdenticalAlignments(PsiAlignmentData* alignment);
  201. /* Keeps only one copy of any aligned sequences which are >PSI_NEAR_IDENTICAL%
  202.  * identical to one another */
  203. static void
  204. _PSIPurgeNearIdenticalAlignments(PsiAlignmentData* alignment);
  205. static void
  206. _PSIUpdatePositionCounts(PsiAlignmentData* alignment);
  207. /* FIXME: needs more descriptive name */
  208. static void
  209. _PSIPurgeSimilarAlignments(PsiAlignmentData* alignment,
  210.                            Uint4 seq_index1,
  211.                            Uint4 seq_index2,
  212.                            double max_percent_identity);
  213. /****************************************************************************/
  214. /**************** PurgeMatches stage of PSSM creation ***********************/
  215. int
  216. PSIPurgeBiasedSegments(PsiAlignmentData* alignment)
  217. {
  218.     if ( !alignment ) {
  219.         return PSIERR_BADPARAM;
  220.     }
  221.     _PSIPurgeIdenticalAlignments(alignment);
  222.     _PSIPurgeNearIdenticalAlignments(alignment);
  223.     _PSIUpdatePositionCounts(alignment);
  224.     return PSI_SUCCESS;
  225. }
  226. /** Remove those sequences which are identical to the query sequence 
  227.  * FIXME: Rename to _PSIPurgeSelfHits() ?
  228.  */
  229. static void
  230. _PSIPurgeIdenticalAlignments(PsiAlignmentData* alignment)
  231. {
  232.     Uint4 s = 0;        /* index on sequences */
  233.     ASSERT(alignment);
  234.     for (s = 0; s < alignment->dimensions->num_seqs + 1; s++) {
  235.         _PSIPurgeSimilarAlignments(alignment, kQueryIndex, s, PSI_IDENTICAL);
  236.     }
  237. }
  238. static void
  239. _PSIPurgeNearIdenticalAlignments(PsiAlignmentData* alignment)
  240. {
  241.     Uint4 i = 0;
  242.     Uint4 j = 0;
  243.     ASSERT(alignment);
  244.     for (i = 1; i < alignment->dimensions->num_seqs + 1; i++) { 
  245.         for (j = 1; (i + j) < alignment->dimensions->num_seqs + 1; j++) {
  246.             _PSIPurgeSimilarAlignments(alignment, i, (i + j), 
  247.                                        PSI_NEAR_IDENTICAL);
  248.         }
  249.     }
  250. }
  251. /** Counts the number of sequences matching the query per query position
  252.  * (columns of the multiple alignment) as well as the number of residues
  253.  * present in each position of the query.
  254.  * Should be called after multiple alignment data has been purged from biased
  255.  * sequences.
  256.  */
  257. static void
  258. _PSIUpdatePositionCounts(PsiAlignmentData* alignment)
  259. {
  260.     Uint4 s = 0;        /* index on aligned sequences */
  261.     Uint4 p = 0;        /* index on positions */
  262.     ASSERT(alignment);
  263.     for (s = kQueryIndex + 1; s < alignment->dimensions->num_seqs + 1; s++) {
  264.         if ( !alignment->use_sequences[s] )
  265.             continue;
  266.         for (p = 0; p < alignment->dimensions->query_sz; p++) {
  267.             if (alignment->desc_matrix[s][p].used) {
  268.                 const Uint1 res = alignment->desc_matrix[s][p].letter;
  269.                 if (res >= PSI_ALPHABET_SIZE) {
  270.                     continue;
  271.                 }
  272.                 alignment->res_counts[p][res]++;
  273.                 alignment->match_seqs[p]++;
  274.             }
  275.         }
  276.     }
  277. }
  278. /** This function compares the sequences in the alignment->desc_matrix
  279.  * structure indexed by sequence_index1 and seq_index2. If it finds aligned 
  280.  * regions that have a greater percent identity than max_percent_identity, 
  281.  * it removes the sequence identified by seq_index2.
  282.  */
  283. static void
  284. _PSIPurgeSimilarAlignments(PsiAlignmentData* alignment,
  285.                            Uint4 seq_index1,
  286.                            Uint4 seq_index2,
  287.                            double max_percent_identity)
  288. {
  289.     Uint4 i = 0;
  290.     /* Nothing to do if sequences are the same or not selected for further
  291.        processing */
  292.     if ( seq_index1 == seq_index2 ||
  293.          !alignment->use_sequences[seq_index1] ||
  294.          !alignment->use_sequences[seq_index2] ) {
  295.         return;
  296.     }
  297.     for (i = 0; i < alignment->dimensions->query_sz; i++) {
  298.         const PsiDesc* seq1 = alignment->desc_matrix[seq_index1];
  299.         const PsiDesc* seq2 = alignment->desc_matrix[seq_index2];
  300.         /* starting index of the aligned region */
  301.         Uint4 align_start = i;    
  302.         /* length of the aligned region */
  303.         Uint4 align_length = 0;   
  304.         /* # of identical residues in aligned region */
  305.         Uint4 nidentical = 0;     
  306.         /* both positions in the sequences must be used */
  307.         if ( !(seq1[i].used && seq2[i].used) ) {
  308.             continue;
  309.         }
  310.         /* examine the aligned region (FIXME: should we care about Xs?) */
  311.         while ( (i < alignment->dimensions->query_sz) && 
  312.                 (seq1[i].used && seq2[i].used)) {
  313.             if (seq1[i].letter == seq2[i].letter)
  314.                 nidentical++;
  315.             align_length++;
  316.             i++;
  317.         }
  318.         ASSERT(align_length != 0);
  319.         /* percentage of similarity of an aligned region between seq1 and 
  320.            seq2 */
  321.         {
  322.             double percent_identity = (double) nidentical / align_length;
  323.                 fprintf(stderr, "%f%% for seqs %d and %dn",
  324.                         percent_identity, seq_index1, seq_index2);
  325.             if (percent_identity >= max_percent_identity) {
  326.                 int rv = _PSIPurgeAlignedRegion(alignment, seq_index2, 
  327.                                                 align_start, 
  328.                                                 align_start+align_length);
  329.                 ASSERT(rv == PSI_SUCCESS);
  330.             }
  331.         }
  332.     }
  333. }
  334. /****************************************************************************/
  335. /* Function prototypes */
  336. static void
  337. _PSIGetLeftExtents(const PsiAlignmentData* alignment, Uint4 seq_index);
  338. static void
  339. _PSIGetRightExtents(const PsiAlignmentData* alignment, Uint4 seq_index);
  340. static void
  341. _PSIComputePositionExtents(const PsiAlignmentData* alignment, 
  342.                            Uint4 seq_index,
  343.                            PsiAlignedBlock* aligned_blocks);
  344. static void
  345. _PSIComputeAlignedRegionLengths(const PsiAlignmentData* alignment,
  346.                                 PsiAlignedBlock* aligned_blocks);
  347. /****************************************************************************/
  348. /******* Compute alignment extents stage of PSSM creation *******************/
  349. /* posComputeExtents in posit.c */
  350. int
  351. PSIComputeAlignmentBlocks(const PsiAlignmentData* alignment,        /* [in] */
  352.                           PsiAlignedBlock* aligned_blocks)          /* [out] */
  353. {
  354.     Uint4 s = 0;     /* index on aligned sequences */
  355.     if ( !alignment || !aligned_blocks ) {
  356.         return PSIERR_BADPARAM;
  357.     }
  358.     /* no need to compute extents for query sequence */
  359.     for (s = kQueryIndex + 1; s < alignment->dimensions->num_seqs + 1; s++) {
  360.         if ( !alignment->use_sequences[s] )
  361.             continue;
  362.         _PSIGetLeftExtents(alignment, s);
  363.         _PSIGetRightExtents(alignment, s);
  364.         _PSIComputePositionExtents(alignment, s, aligned_blocks);
  365.     }
  366.     _PSIComputeAlignedRegionLengths(alignment, aligned_blocks);
  367.     return PSI_SUCCESS;
  368. }
  369. static void
  370. _PSIGetLeftExtents(const PsiAlignmentData* alignment, Uint4 seq_index)
  371. {
  372.     const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-'];
  373.     PsiDesc* sequence_position = NULL;
  374.     Uint4 prev = 0;  /* index for the first and previous position */
  375.     Uint4 curr = 0;  /* index for the current position */
  376.     ASSERT(alignment);
  377.     ASSERT(seq_index < alignment->dimensions->num_seqs + 1);
  378.     ASSERT(alignment->use_sequences[seq_index]);
  379.     sequence_position = alignment->desc_matrix[seq_index];
  380.     if (sequence_position[prev].used && sequence_position[prev].letter != GAP) {
  381.         sequence_position[prev].extents.left = prev;
  382.     }
  383.     for (curr = prev + 1; curr < alignment->dimensions->query_sz; 
  384.          curr++, prev++) {
  385.         if ( !sequence_position[curr].used ) {
  386.             continue;
  387.         }
  388.         if (sequence_position[prev].used) {
  389.             sequence_position[curr].extents.left =
  390.                 sequence_position[prev].extents.left;
  391.         } else {
  392.             sequence_position[curr].extents.left = curr;
  393.         }
  394.     }
  395. }
  396. static void
  397. _PSIGetRightExtents(const PsiAlignmentData* alignment, Uint4 seq_index)
  398. {
  399.     const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-'];
  400.     PsiDesc* sequence_position = NULL;
  401.     Uint4 last = 0;      /* index for the last position */
  402.     Uint4 curr = 0;      /* index for the current position */
  403.     ASSERT(alignment);
  404.     ASSERT(seq_index < alignment->dimensions->num_seqs + 1);
  405.     ASSERT(alignment->use_sequences[seq_index]);
  406.     sequence_position = alignment->desc_matrix[seq_index];
  407.     last = alignment->dimensions->query_sz - 1;
  408.     if (sequence_position[last].used && sequence_position[last].letter != GAP) {
  409.         sequence_position[last].extents.right = last;
  410.     }
  411.     for (curr = last - 1; curr >= 0; curr--, last--) {
  412.         if ( !sequence_position[curr].used ) {
  413.             continue;
  414.         }
  415.         if (sequence_position[last].used) {
  416.             sequence_position[curr].extents.right =
  417.                 sequence_position[last].extents.right;
  418.         } else {
  419.             sequence_position[curr].extents.right = curr;
  420.         }
  421.     }
  422. }
  423. static void
  424. _PSIComputePositionExtents(const PsiAlignmentData* alignment, 
  425.                            Uint4 seq_index,
  426.                            PsiAlignedBlock* aligned_blocks)
  427. {
  428. #ifdef PSI_IGNORE_GAPS_IN_COLUMNS
  429.     const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-'];
  430. #endif
  431.     PsiDesc* sequence_position = NULL;
  432.     Uint4 i = 0;
  433.     ASSERT(aligned_blocks);
  434.     ASSERT(alignment);
  435.     ASSERT(seq_index < alignment->dimensions->num_seqs + 1);
  436.     ASSERT(alignment->use_sequences[seq_index]);
  437.     sequence_position = alignment->desc_matrix[seq_index];
  438.     for (i = 0; i < alignment->dimensions->query_sz; i++) {
  439. #ifdef PSI_IGNORE_GAPS_IN_COLUMNS
  440.         if (sequence_position[i].used && 
  441.             sequence_position[i].letter != GAP) {
  442. #else
  443.         if (sequence_position[i].used) {
  444. #endif
  445.             aligned_blocks->pos_extnt[i].left = 
  446.                 MAX(aligned_blocks->pos_extnt[i].left, 
  447.                     sequence_position[i].extents.left);
  448.             aligned_blocks->pos_extnt[i].right = 
  449.                 MIN(aligned_blocks->pos_extnt[i].right, 
  450.                     sequence_position[i].extents.right);
  451.         }
  452.     }
  453. }
  454. static void
  455. _PSIComputeAlignedRegionLengths(const PsiAlignmentData* alignment,
  456.                                 PsiAlignedBlock* aligned_blocks)
  457. {
  458.     const Uint1 X = AMINOACID_TO_NCBISTDAA['X'];
  459.     PsiDesc* query_seq = NULL;
  460.     Uint4 i = 0;
  461.     
  462.     ASSERT(alignment);
  463.     ASSERT(aligned_blocks);
  464.     for (i = 0; i < alignment->dimensions->query_sz; i++) {
  465.         aligned_blocks->size[i] = aligned_blocks->pos_extnt[i].right - 
  466.                                    aligned_blocks->pos_extnt[i].left;
  467.     }
  468.     query_seq = alignment->desc_matrix[kQueryIndex];
  469.     /* Do not include X's in aligned region lengths */
  470.     for (i = 0; i < alignment->dimensions->query_sz; i++) {
  471.         if (query_seq[i].letter == X) {
  472.             Uint4 idx = 0;
  473.             for (idx = 0; idx < i; idx++) {
  474.                 if ((Uint4)aligned_blocks->pos_extnt[idx].right >= i &&
  475.                     query_seq[idx].letter != X) {
  476.                     aligned_blocks->size[idx]--;
  477.                 }
  478.             }
  479.             for (idx = alignment->dimensions->query_sz - 1; idx > i; idx--) {
  480.                 if ((Uint4)aligned_blocks->pos_extnt[idx].left <= i &&
  481.                     query_seq[idx].letter != X) {
  482.                     aligned_blocks->size[idx]--;
  483.                 }
  484.             }
  485.         }
  486.     }
  487. }
  488. /****************************************************************************/
  489. static Uint4
  490. _PSIGetAlignedSequencesForPosition(
  491.     const PsiAlignmentData* alignment, 
  492.     Uint4 position,
  493.     Uint4* aligned_sequences);
  494. static void
  495. _PSICalculatePositionWeightsAndIntervalSigmas(
  496.     const PsiAlignmentData* alignment,     /* [in] */
  497.     const PsiAlignedBlock* aligned_blocks, /* [in] */
  498.     Uint4 position,                        /* [in] */
  499.     const Uint4* aligned_seqs,             /* [in] */
  500.     Uint4 num_aligned_seqs,                /* [in] */
  501.     PsiSequenceWeights* seq_weights,       /* [out] */
  502.     double* sigma,                         /* [out] */
  503.     double* interval_sigma);               /* [out] */
  504. static void
  505. _PSICalculateNormalizedSequenceWeights(
  506.     const PsiAlignedBlock* aligned_blocks, /* [in] */
  507.     Uint4 position,                        /* [in] */
  508.     const Uint4* aligned_seqs,             /* [in] */
  509.     Uint4 num_aligned_seqs,                /* [in] */
  510.     double sigma,                          /* [in] */
  511.     PsiSequenceWeights* seq_weights);      /* [out] */
  512. static void
  513. _PSICalculateMatchWeights(
  514.     const PsiAlignmentData* alignment,  /* [in] */
  515.     Uint4 position,                     /* [in] */
  516.     const Uint4* aligned_seqs,          /* [in] */
  517.     Uint4 num_aligned_seqs,             /* [in] */
  518.     PsiSequenceWeights* seq_weights);   /* [out] */
  519. static int
  520. _PSICheckSequenceWeights(
  521.     const PsiAlignmentData* alignment,         /* [in] */
  522.     PsiSequenceWeights* seq_weights);          /* [in] */
  523. /****************************************************************************/
  524. /******* Calculate sequence weights stage of PSSM creation ******************/
  525. /* Needs the PsiAlignedBlock structure calculated in previous stage as well
  526.  * as PsiAlignmentData structure */
  527. int
  528. PSIComputeSequenceWeights(const PsiAlignmentData* alignment,        /* [in] */
  529.                           const PsiAlignedBlock* aligned_blocks,    /* [in] */
  530.                           PsiSequenceWeights* seq_weights)          /* [out] */
  531. {
  532.     Uint4* aligned_seqs = NULL;     /* list of indices of sequences
  533.                                               which participate in an
  534.                                               aligned position */
  535.     Uint4 pos = 0;                  /* position index */
  536.     ASSERT(alignment);
  537.     ASSERT(aligned_blocks);
  538.     ASSERT(seq_weights);
  539.     aligned_seqs = (Uint4*) calloc(alignment->dimensions->num_seqs + 1,
  540.                                           sizeof(Uint4));
  541.     if ( !aligned_seqs ) {
  542.         return PSIERR_OUTOFMEM;
  543.     }
  544.     for (pos = 0; pos < alignment->dimensions->query_sz; pos++) {
  545.         Uint4 num_aligned_seqs = 0;
  546.         double sigma = 0.0; /**< number of different characters
  547.                                   occurring in matches within a multiple
  548.                                   alignment block, excluding identical
  549.                                   columns */
  550.         double interval_sigma = 0.0; /**< same as sigma but includes identical
  551.                                   columns */
  552.         /* ignore positions of no interest */
  553.         if (aligned_blocks->size[pos] == 0 || alignment->match_seqs[pos] <= 1) {
  554.             continue;
  555.         }
  556.         memset((void*)aligned_seqs, 0, 
  557.                sizeof(Uint4) * (alignment->dimensions->num_seqs + 1));
  558.         num_aligned_seqs = _PSIGetAlignedSequencesForPosition(alignment, pos,
  559.                                                               aligned_seqs);
  560.         if (num_aligned_seqs <= 1) {
  561.             continue;
  562.         }
  563.         /* Skipping optimization about redundant sets */
  564.         /* if (newSequenceSet) in posit.c */
  565.         memset((void*)seq_weights->norm_seq_weights, 0, 
  566.                sizeof(double)*(alignment->dimensions->num_seqs+1));
  567.         memset((void*)seq_weights->row_sigma, 0,
  568.                sizeof(double)*(alignment->dimensions->num_seqs+1));
  569.         _PSICalculatePositionWeightsAndIntervalSigmas(alignment, 
  570.                                      aligned_blocks, pos, aligned_seqs, 
  571.                                      num_aligned_seqs, seq_weights, 
  572.                                      &sigma, &interval_sigma);
  573.         seq_weights->sigma[pos] = interval_sigma;
  574.         /* Populates norm_seq_weights */
  575.         _PSICalculateNormalizedSequenceWeights(aligned_blocks, pos, 
  576.                                  aligned_seqs, num_aligned_seqs, sigma,
  577.                                  seq_weights);
  578.         /* Uses seq_weights->norm_seq_weights to populate match_weights */
  579.         _PSICalculateMatchWeights(alignment, pos, aligned_seqs,
  580.                                   num_aligned_seqs, seq_weights);
  581.     }
  582.     sfree(aligned_seqs);
  583.     /* Check that the sequence weights add up to 1 in each column */
  584.     _PSICheckSequenceWeights(alignment, seq_weights);
  585.     /* Return seq_weights->match_weigths, should free others? FIXME: need to
  586.      * keep sequence weights for diagnostics for structure group */
  587.     return PSI_SUCCESS;
  588. }
  589. /* Calculates the position based weights using a modified version of the
  590.  * Henikoff's algorithm presented in "Position-based sequence weights".
  591.  * More documentation pending */
  592. static void
  593. _PSICalculatePositionWeightsAndIntervalSigmas(
  594.     const PsiAlignmentData* alignment,     /* [in] */
  595.     const PsiAlignedBlock* aligned_blocks, /* [in] */
  596.     Uint4 position,                        /* [in] */
  597.     const Uint4* aligned_seqs,             /* [in] */
  598.     Uint4 num_aligned_seqs,                /* [in] */
  599.     PsiSequenceWeights* seq_weights,       /* [out] */
  600.     double* sigma,                         /* [out] */
  601.     double* interval_sigma)                /* [out] */
  602. {
  603.     /** keeps track of how many occurrences of each residue was found at a
  604.      * given position */
  605.     Uint4 residue_counts[PSI_ALPHABET_SIZE];
  606.     Uint4 num_distinct_residues = 0; /**< number of distinct 
  607.                                               residues found at a given 
  608.                                               position */
  609.     Uint4 i = 0;         /**< index into aligned block for requested
  610.                                    position */
  611.     ASSERT(seq_weights);
  612.     ASSERT(sigma);
  613.     ASSERT(interval_sigma);
  614.     *sigma = 0.0;
  615.     *interval_sigma = 0.0;
  616.     for (i = 0; i < sizeof(residue_counts); i++) {
  617.         residue_counts[i] = 0;
  618.     }
  619.     for (i = (Uint4) aligned_blocks->pos_extnt[position].left; 
  620.          i <= (Uint4) aligned_blocks->pos_extnt[position].right; i++) {
  621.         Uint4 asi = 0;   /**< index into array of aligned sequences */
  622.         /**** Count number of occurring residues at a position ****/
  623.         for (asi = 0; asi < num_aligned_seqs; asi++) {
  624.             const Uint4 seq_idx = aligned_seqs[asi];
  625.             const Uint1 residue = 
  626.                 alignment->desc_matrix[seq_idx][i].letter;
  627.             if (residue_counts[residue] == 0) {
  628.                 num_distinct_residues++;
  629.             }
  630.             residue_counts[residue]++;
  631.         }
  632.         /**** END: Count number of occurring residues at a position ****/
  633.         /* FIXME: see Alejandro's email about this */
  634.         (*interval_sigma) += num_distinct_residues;
  635.         if (num_distinct_residues > 1) {    /* if this is not 1 */
  636.             (*sigma) += num_distinct_residues;
  637.         }
  638.         /* Calculate row_sigma */
  639.         for (asi = 0; asi < num_aligned_seqs; asi++) {
  640.             const Uint4 seq_idx = aligned_seqs[asi];
  641.             const Uint1 residue =
  642.                 alignment->desc_matrix[seq_idx][i].letter;
  643.             seq_weights->row_sigma[seq_idx] += 
  644.                 (1.0 / (double) 
  645.                  (residue_counts[residue] * num_distinct_residues) );
  646.         }
  647.     }
  648.     return;
  649. }
  650. /** Calculates the normalized sequence weights for the requested position */
  651. static void
  652. _PSICalculateNormalizedSequenceWeights(
  653.     const PsiAlignedBlock* aligned_blocks, /* [in] */
  654.     Uint4 position,                        /* [in] */
  655.     const Uint4* aligned_seqs,             /* [in] */
  656.     Uint4 num_aligned_seqs,                /* [in] */
  657.     double sigma,                          /* [in] */
  658.     PsiSequenceWeights* seq_weights)       /* [out] */
  659. {
  660.     Uint4 asi = 0;   /**< index into array of aligned sequences */
  661.     if (sigma > 0) {
  662.         double weight_sum = 0.0;
  663.         for (asi = 0; asi < num_aligned_seqs; asi++) {
  664.             const Uint4 seq_idx = aligned_seqs[asi];
  665.             seq_weights->norm_seq_weights[seq_idx] = 
  666.                 seq_weights->row_sigma[seq_idx] / 
  667.                 (aligned_blocks->pos_extnt[position].right -
  668.                  aligned_blocks->pos_extnt[position].left + 1);
  669. #ifndef PSI_IGNORE_GAPS_IN_COLUMNS
  670.             weight_sum += seq_weights->norm_seq_weights[seq_idx];
  671. #endif
  672.         }
  673.         for (asi = 0; asi < num_aligned_seqs; asi++) {
  674.             const Uint4 seq_idx = aligned_seqs[asi];
  675.             seq_weights->norm_seq_weights[seq_idx] /= weight_sum;
  676.         }
  677.     } else {
  678.         for (asi = 0; asi < num_aligned_seqs; asi++) {
  679.             const Uint4 seq_idx = aligned_seqs[asi];
  680.             seq_weights->norm_seq_weights[seq_idx] = 
  681.                 (1.0/(double) num_aligned_seqs);
  682.         }
  683.     }
  684. }
  685. static void
  686. _PSICalculateMatchWeights(
  687.     const PsiAlignmentData* alignment,  /* [in] */
  688.     Uint4 position,                     /* [in] */
  689.     const Uint4* aligned_seqs,          /* [in] */
  690.     Uint4 num_aligned_seqs,             /* [in] */
  691.     PsiSequenceWeights* seq_weights)    /* [out] */
  692. {
  693.     const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-'];
  694.     Uint4 asi = 0;   /**< index into array of aligned sequences */
  695.     for (asi = 0; asi < num_aligned_seqs; asi++) {
  696.         const Uint4 seq_idx = aligned_seqs[asi];
  697.         const Uint1 residue =
  698.             alignment->desc_matrix[seq_idx][position].letter;
  699.         seq_weights->match_weights[position][residue] += 
  700.             seq_weights->norm_seq_weights[seq_idx];
  701.         /* FIXME: this field is populated but never used */
  702.         if (residue == GAP) {
  703.             /*seq_weights->gapless_column_weights[position] +=
  704.              * seq_weights->a[seq_idx]; */
  705.             ;
  706.         }
  707.     }
  708. }
  709. /** Finds the sequences aligned in a given position.
  710.  * @param alignment Multiple-alignment data [in]
  711.  * @param position position of interest [in]
  712.  * @param aligned_sequences array which will contain the indices of the
  713.  * sequences aligned at the requested position. This array must have size
  714.  * greater than or equal to the number of sequences + 1 in multiple alignment 
  715.  * data structure (alignment->dimensions->num_seqs + 1) [out]
  716.  * @return number of sequences aligned at the requested position
  717.  */
  718. static Uint4
  719. _PSIGetAlignedSequencesForPosition(const PsiAlignmentData* alignment, 
  720.                                    Uint4 position,
  721.                                    Uint4* aligned_sequences)
  722. {
  723. #ifdef PSI_IGNORE_GAPS_IN_COLUMNS
  724.     const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-'];
  725. #endif
  726.     Uint4 retval = 0;
  727.     Uint4 i = 0;
  728.     ASSERT(alignment);
  729.     ASSERT(position < alignment->dimensions->query_sz);
  730.     ASSERT(aligned_sequences);
  731.     for (i = 0; i < alignment->dimensions->num_seqs + 1; i++) {
  732.         if ( !alignment->use_sequences[i] ) {
  733.             continue;
  734.         }
  735. #ifdef PSI_IGNORE_GAPS_IN_COLUMNS
  736.         if (alignment->desc_matrix[i][position].used &&
  737.             alignment->desc_matrix[i][position].letter != GAP) {
  738. #else
  739.         if (alignment->desc_matrix[i][position].used) {
  740. #endif
  741.             aligned_sequences[retval++] = i;
  742.         }
  743.     }
  744.     return retval;
  745. }
  746. /* The second parameter is not really const, it's updated! */
  747. static int
  748. _PSICheckSequenceWeights(const PsiAlignmentData* alignment,
  749.                          PsiSequenceWeights* seq_weights)
  750. {
  751.     const Uint1 GAP = AMINOACID_TO_NCBISTDAA['-'];
  752.     const Uint1 X = AMINOACID_TO_NCBISTDAA['X'];
  753.     Uint4 pos = 0;   /* residue position (ie: column number) */
  754.     Uint4 res = 0;   /* residue */
  755.     ASSERT(alignment);
  756.     ASSERT(seq_weights);
  757.     for (pos = 0; pos < alignment->dimensions->query_sz; pos++) {
  758.         double running_total = 0.0;
  759.         if (alignment->match_seqs[pos] <= 1 ||
  760.             alignment->desc_matrix[kQueryIndex][pos].letter == X) {
  761.             continue;
  762.         }
  763.         for (res = 0; res < PSI_ALPHABET_SIZE; res++) {
  764.             running_total += seq_weights->match_weights[pos][res];
  765.         }
  766.         ASSERT(running_total < 0.99 || running_total > 1.01);
  767. #ifndef PSI_IGNORE_GAPS_IN_COLUMNS
  768.         /* Disperse method of spreading the gap weights */
  769.         for (res = 0; res < PSI_ALPHABET_SIZE; res++) {
  770.             if (seq_weights->std_prob[res] > kEpsilon) {
  771.                 seq_weights->match_weights[pos][res] += 
  772.                     (seq_weights->match_weights[pos][GAP] * 
  773.                      seq_weights->std_prob[res]);
  774.             }
  775.         }
  776. #endif
  777.         seq_weights->match_weights[pos][GAP] = 0.0;
  778.         running_total = 0.0;
  779.         for (res = 0; res < PSI_ALPHABET_SIZE; res++) {
  780.             running_total += seq_weights->match_weights[pos][res];
  781.         }
  782.         if (running_total < 0.99 || running_total > 1.01) {
  783.             return PSIERR_BADSEQWEIGHTS;
  784.         }
  785.     }
  786.     return PSI_SUCCESS;
  787. }
  788. /****************************************************************************/
  789. /******* Compute residue frequencies stage of PSSM creation *****************/
  790. /* port of posComputePseudoFreqs */
  791. int
  792. PSIComputeResidueFrequencies(const PsiAlignmentData* alignment,     /* [in] */
  793.                              const PsiSequenceWeights* seq_weights, /* [in] */
  794.                              const BlastScoreBlk* sbp,              /* [in] */
  795.                              const PsiAlignedBlock* aligned_blocks, /* [in] */
  796.                              const PSIBlastOptions* opts,           /* [in] */
  797.                              PsiMatrix* score_matrix)               /* [out] */
  798. {
  799.     const Uint1 X = AMINOACID_TO_NCBISTDAA['X'];
  800.     Uint4 i = 0;             /* loop index into query positions */
  801.     SFreqRatios* freq_ratios;       /* matrix-specific frequency ratios */
  802.     if ( !alignment || !seq_weights || !sbp || 
  803.          !aligned_blocks || !opts || !score_matrix ) {
  804.         return PSIERR_BADPARAM;
  805.     }
  806.     freq_ratios = _PSIMatrixFrequencyRatiosNew(sbp->name);
  807.     for (i = 0; i < alignment->dimensions->query_sz; i++) {
  808.         Uint4 j = 0;     /* loop index into alphabet */
  809.         double info_sum = 0.0;  /* for information content - FIXME calculate
  810.                                    separately */
  811.         /* If there is an 'X' in the query sequence at position i... */
  812.         if (alignment->desc_matrix[kQueryIndex][i].letter == X) {
  813.             for (j = 0; j < (Uint4) sbp->alphabet_size; j++) {
  814.                 score_matrix->res_freqs[i][j] = 0.0;
  815.             }
  816.         } else {
  817.             for (j = 0; j < (Uint4) sbp->alphabet_size; j++) {
  818.                 if (seq_weights->std_prob[j] > kEpsilon) {
  819.                     Uint4 interval_size = 0; /* length of a block */
  820.                     Uint4 idx = 0;  /* loop index */
  821.                     double sigma = 0.0;    /* number of chars in an interval */
  822.                     double pseudo = 0.0;            /* intermediate term */
  823.                     double numerator = 0.0;         /* intermediate term */
  824.                     double denominator = 0.0;       /* intermediate term */
  825.                     double qOverPEstimate = 0.0;    /* intermediate term */
  826.                     /* changed to matrix specific ratios here May 2000 */
  827.                     for (idx = 0; idx < (Uint4) sbp->alphabet_size; idx++) {
  828.                         if (sbp->matrix[j][idx] != BLAST_SCORE_MIN) {
  829.                             pseudo += (seq_weights->match_weights[i][idx] *
  830.                                        freq_ratios->data[j][idx]);
  831.                         }
  832.                     }
  833.                     pseudo *= opts->pseudo_count;
  834.                     /* FIXME: document where this formula is coming from
  835.                      * (probably 2001 paper, p 2996) */
  836.                     sigma = seq_weights->sigma[i];
  837.                     interval_size = aligned_blocks->size[i];
  838.                     numerator = pseudo + 
  839.                         ((sigma/interval_size-1) * 
  840.                          seq_weights->match_weights[i][j] / 
  841.                          seq_weights->std_prob[j]);
  842.                     denominator = (sigma/interval_size-1) +
  843.                         opts->pseudo_count;
  844.                     qOverPEstimate = numerator/denominator;
  845.                     /* Note artificial multiplication by standard probability
  846.                      * to normalize */
  847.                     score_matrix->res_freqs[i][j] = qOverPEstimate *
  848.                         seq_weights->std_prob[j];
  849.                     if ( qOverPEstimate != 0.0 &&
  850.                          (seq_weights->std_prob[j] > kEpsilon) ) {
  851.                         info_sum += qOverPEstimate * seq_weights->std_prob[j] *
  852.                             log(qOverPEstimate)/NCBIMATH_LN2;
  853.                     }
  854.                 } else {
  855.                     score_matrix->res_freqs[i][j] = 0.0;
  856.                 } /* END: if (seq_weights->std_prob[j] > kEpsilon) { */
  857.             } /* END: for (j = 0; j < sbp->alphabet_size; j++) */
  858.         }
  859.         /* FIXME: Should move out the calculation of information content to its
  860.          * own function (see posFreqsToInformation)! */
  861.         seq_weights->info_content[i] = info_sum;
  862.     }
  863.     freq_ratios = _PSIMatrixFrequencyRatiosFree(freq_ratios);
  864.     return PSI_SUCCESS;
  865. }
  866. /****************************************************************************/
  867. /**************** Convert residue frequencies to PSSM stage *****************/
  868. /* FIXME: Answer questions
  869.    FIXME: need ideal_labmda, regular scoring matrix, length of query
  870. */
  871. int
  872. PSIConvertResidueFreqsToPSSM(PsiMatrix* score_matrix,           /* [in|out] */
  873.                              const Uint1* query,                /* [in] */
  874.                              const BlastScoreBlk* sbp,          /* [in] */
  875.                              const double* std_probs)           /* [in] */
  876. {
  877.     const Uint4 X = AMINOACID_TO_NCBISTDAA['X'];
  878.     const Uint4 Star = AMINOACID_TO_NCBISTDAA['*'];
  879.     Uint4 i = 0;
  880.     Uint4 j = 0;
  881.     SFreqRatios* std_freq_ratios = NULL;    /* only needed when there are not
  882.                                                residue frequencies for a given 
  883.                                                column */
  884.     double ideal_lambda;
  885.     if ( !score_matrix || !sbp || !std_probs )
  886.         return PSIERR_BADPARAM;
  887.     std_freq_ratios = _PSIMatrixFrequencyRatiosNew(sbp->name);
  888.     ideal_lambda = sbp->kbp_ideal->Lambda;
  889.     /* Each column is a position in the query */
  890.     for (i = 0; i < score_matrix->ncols; i++) {
  891.         /* True if all frequencies in column i are zero */
  892.         Boolean is_unaligned_column = TRUE;
  893.         const Uint4 query_res = query[i];
  894.         for (j = 0; j < (Uint4) sbp->alphabet_size; j++) {
  895.             double qOverPEstimate = 0.0;
  896.             /* Division compensates for multiplication in previous function */
  897.             if (std_probs[j] > kEpsilon) {
  898.                 qOverPEstimate = 
  899.                     score_matrix->res_freqs[i][j] / std_probs[j];
  900.             }
  901.             if (is_unaligned_column && qOverPEstimate != 0.0) {
  902.                 is_unaligned_column = FALSE;
  903.             }
  904.             /* Populate scaled matrix */
  905.             if (qOverPEstimate == 0.0 || std_probs[j] < kEpsilon) {
  906.                 score_matrix->scaled_pssm[i][j] = BLAST_SCORE_MIN;
  907.             } else {
  908.                 double tmp = log(qOverPEstimate)/ideal_lambda;
  909.                 score_matrix->scaled_pssm[i][j] = (int)
  910.                     BLAST_Nint(kPsiScaleFactor * tmp);
  911.             }
  912.             if ( (j == X || j == Star) &&
  913.                  (sbp->matrix[query_res][X] != BLAST_SCORE_MIN) ) {
  914.                 score_matrix->scaled_pssm[i][j] = 
  915.                     sbp->matrix[query_res][j] * kPsiScaleFactor;
  916.             }
  917.         }
  918.         if (is_unaligned_column) {
  919.             for (j = 0; j < (Uint4) sbp->alphabet_size; j++) {
  920.                 score_matrix->pssm[i][j] = sbp->matrix[query_res][j];
  921.                 if (sbp->matrix[query_res][j] != BLAST_SCORE_MIN) {
  922.                     double tmp = 
  923.                         kPsiScaleFactor * std_freq_ratios->bit_scale_factor *
  924.                         log(std_freq_ratios->data[query_res][j])/NCBIMATH_LN2;
  925.                     score_matrix->scaled_pssm[i][j] = BLAST_Nint(tmp);
  926.                 } else {
  927.                     score_matrix->scaled_pssm[i][j] = BLAST_SCORE_MIN;
  928.                 }
  929.             }
  930.         }
  931.     }
  932.     std_freq_ratios = _PSIMatrixFrequencyRatiosFree(std_freq_ratios);
  933.     /* Set the last column of the matrix to BLAST_SCORE_MIN (why?) */
  934.     for (j = 0; j < (Uint4) sbp->alphabet_size; j++) {
  935.         score_matrix->scaled_pssm[score_matrix->ncols-1][j] = BLAST_SCORE_MIN;
  936.     }
  937.     return PSI_SUCCESS;
  938. }
  939. /****************************************************************************/
  940. /************************* Scaling of PSSM stage ****************************/
  941. /**
  942.  * @param initial_lambda_guess value to be used when calculating lambda if this
  943.  * is not null [in]
  944.  * @param sbp Score block structure where the calculated lambda and K will be
  945.  * returned
  946.  */
  947. void
  948. _PSIUpdateLambdaK(const int** pssm,              /* [in] */
  949.                   const Uint1* query,            /* [in] */
  950.                   Uint4 query_length,            /* [in] */
  951.                   const double* std_probs,       /* [in] */
  952.                   double* initial_lambda_guess,  /* [in] */
  953.                   BlastScoreBlk* sbp);           /* [in|out] */
  954. /* FIXME: change so that only lambda is calculated inside the loop that scales
  955.    the matrix and kappa is calculated before returning from this function.
  956.    Scaling factor should be optional argument to accomodate kappa.c's needs?
  957. */
  958. int
  959. PSIScaleMatrix(const Uint1* query,              /* [in] */
  960.                Uint4 query_length,              /* [in] */
  961.                const double* std_probs,         /* [in] */
  962.                double* scaling_factor,          /* [in] */
  963.                PsiMatrix* score_matrix,         /* [in|out] */
  964.                BlastScoreBlk* sbp)              /* [in|out] */
  965. {
  966.     Boolean first_time = TRUE;
  967.     Uint4 index = 0;     /* loop index */
  968.     int** scaled_pssm = NULL;
  969.     int** pssm = NULL;
  970.     double factor;
  971.     double factor_low;
  972.     double factor_high;
  973.     double new_lambda;      /* Karlin-Altschul parameter */
  974.     const double kPositPercent = 0.05;
  975.     const Uint4 kPositNumIterations = 10;
  976.     Boolean too_high = TRUE;
  977.     double ideal_lambda;
  978.     if ( !score_matrix || !sbp || !query || !std_probs )
  979.         return PSIERR_BADPARAM;
  980.     ASSERT(sbp->kbp_psi[0]);
  981.     scaled_pssm = score_matrix->scaled_pssm;
  982.     pssm = score_matrix->pssm;
  983.     ideal_lambda = sbp->kbp_ideal->Lambda;
  984.     /* FIXME: need to take scaling_factor into account */
  985.     factor = 1.0;
  986.     while (1) {
  987.         Uint4 i = 0;
  988.         Uint4 j = 0;
  989.         for (i = 0; i < score_matrix->ncols; i++) {
  990.             for (j = 0; j < (Uint4) sbp->alphabet_size; j++) {
  991.                 if (scaled_pssm[i][j] != BLAST_SCORE_MIN) {
  992.                     pssm[i][j] = 
  993.                         BLAST_Nint(factor*scaled_pssm[i][j]/kPsiScaleFactor);
  994.                 } else {
  995.                     pssm[i][j] = BLAST_SCORE_MIN;
  996.                 }
  997.             }
  998.         }
  999.         if (scaling_factor) {
  1000.             double init_lambda_guess = 
  1001.                 sbp->kbp_psi[0]->Lambda / *scaling_factor;
  1002.             _PSIUpdateLambdaK((const int**)pssm, query, query_length, 
  1003.                               std_probs, &init_lambda_guess, sbp);
  1004.         } else {
  1005.             _PSIUpdateLambdaK((const int**)pssm, query, query_length, 
  1006.                               std_probs, NULL, sbp);
  1007.         }
  1008.         new_lambda = sbp->kbp_psi[0]->Lambda;
  1009.         if (new_lambda > ideal_lambda) {
  1010.             if (first_time) {
  1011.                 factor_high = 1.0 + kPositPercent;
  1012.                 factor = factor_high;
  1013.                 too_high = TRUE;
  1014.                 first_time = FALSE;
  1015.             } else {
  1016.                 if (too_high == FALSE) {
  1017.                     break;
  1018.                 }
  1019.                 factor_high += (factor_high - 1.0);
  1020.                 factor = factor_high;
  1021.             }
  1022.         } else if (new_lambda > 0) {
  1023.             if (first_time) {
  1024.                 factor_high = 1.0;
  1025.                 factor_low = 1.0 - kPositPercent;
  1026.                 factor = factor_low;
  1027.                 too_high = FALSE;
  1028.                 first_time = FALSE;
  1029.             } else {
  1030.                 if (too_high == TRUE) {
  1031.                     break;
  1032.                 }
  1033.                 factor_low += (factor_low - 1.0);
  1034.                 factor = factor_low;
  1035.             }
  1036.         } else {
  1037.             return PSIERR_POSITIVEAVGSCORE;
  1038.         }
  1039.     }
  1040.     /* Binary search for kPositNumIterations times */
  1041.     for (index = 0; index < kPositNumIterations; index++) {
  1042.         Uint4 i = 0;
  1043.         Uint4 j = 0;
  1044.         factor = (factor_high + factor_low)/2;
  1045.         for (i = 0; i < score_matrix->ncols; i++) {
  1046.             for (j = 0; j < (Uint4) sbp->alphabet_size; j++) {
  1047.                 if (scaled_pssm[i][j] != BLAST_SCORE_MIN) {
  1048.                     pssm[i][j] = 
  1049.                         BLAST_Nint(factor*scaled_pssm[i][j]/kPsiScaleFactor);
  1050.                 } else {
  1051.                     pssm[i][j] = BLAST_SCORE_MIN;
  1052.                 }
  1053.             }
  1054.         }
  1055.         if (scaling_factor) {
  1056.             double init_lambda_guess = 
  1057.                 sbp->kbp_psi[0]->Lambda / *scaling_factor;
  1058.             _PSIUpdateLambdaK((const int**)pssm, query, query_length, 
  1059.                               std_probs, &init_lambda_guess, sbp);
  1060.         } else {
  1061.             _PSIUpdateLambdaK((const int**)pssm, query, query_length, 
  1062.                               std_probs, NULL, sbp);
  1063.         }
  1064.         new_lambda = sbp->kbp_psi[0]->Lambda;
  1065.         if (new_lambda > ideal_lambda) {
  1066.             factor_low = factor;
  1067.         } else {
  1068.             factor_high = factor;
  1069.         }
  1070.     }
  1071.     /* FIXME: Why is this needed? */
  1072.     for (index = 0; index < (Uint4) sbp->alphabet_size; index++) {
  1073.         pssm[score_matrix->ncols-1][index] = BLAST_SCORE_MIN;
  1074.     }
  1075.     return PSI_SUCCESS;
  1076. }
  1077. Uint4
  1078. _PSISequenceLengthWithoutX(const Uint1* seq, Uint4 length)
  1079. {
  1080.     const Uint1 X = AMINOACID_TO_NCBISTDAA['X'];
  1081.     Uint4 retval = 0;       /* the return value */
  1082.     Uint4 i = 0;            /* loop index */
  1083.     ASSERT(seq);
  1084.     for(i = 0; i < length; i++) {
  1085.         if (seq[i] != X) {
  1086.             retval++;
  1087.         }
  1088.     }
  1089.     return retval;
  1090. }
  1091. Blast_ScoreFreq*
  1092. _PSIComputeScoreProbabilities(const int** pssm,                     /* [in] */
  1093.                               const Uint1* query,                   /* [in] */
  1094.                               Uint4 query_length,                   /* [in] */
  1095.                               const double* std_probs,              /* [in] */
  1096.                               const BlastScoreBlk* sbp)             /* [in] */
  1097. {
  1098.     const Uint1 X = AMINOACID_TO_NCBISTDAA['X'];
  1099.     Uint1 aa_alphabet[BLASTAA_SIZE];            /* ncbistdaa alphabet */
  1100.     Uint4 effective_length = 0;                 /* length of query w/o Xs */
  1101.     Uint4 p = 0;                                /* index on positions */
  1102.     Uint4 c = 0;                                /* index on characters */
  1103.     int s = 0;                                  /* index on scores */
  1104.     int min_score = 0;                          /* minimum score in pssm */
  1105.     int max_score = 0;                          /* maximum score in pssm */
  1106.     short rv = 0;                               /* temporary return value */
  1107.     Blast_ScoreFreq* score_freqs = NULL;        /* score frequencies */
  1108.     ASSERT(pssm);
  1109.     ASSERT(query);
  1110.     ASSERT(std_probs);
  1111.     ASSERT(sbp);
  1112.     ASSERT(sbp->alphabet_code == BLASTAA_SEQ_CODE);
  1113.     rv = Blast_GetStdAlphabet(sbp->alphabet_code, aa_alphabet, BLASTAA_SIZE);
  1114.     if (rv <= 0) {
  1115.         return NULL;
  1116.     }
  1117.     ASSERT(rv == sbp->alphabet_size);
  1118.     effective_length = _PSISequenceLengthWithoutX(query, query_length);
  1119.     /* Get the minimum and maximum scores */
  1120.     for (p = 0; p < query_length; p++) {
  1121.         for (c = 0; c < (Uint4) sbp->alphabet_size; c++) {
  1122.             const int kScore = pssm[p][aa_alphabet[c]];
  1123.             if (kScore <= BLAST_SCORE_MIN || kScore >= BLAST_SCORE_MAX) {
  1124.                 continue;
  1125.             }
  1126.             max_score = MAX(kScore, max_score);
  1127.             min_score = MIN(kScore, min_score);
  1128.         }
  1129.     }
  1130.     if ( !(score_freqs = Blast_ScoreFreqNew(min_score, max_score)))
  1131.         return NULL;
  1132.     score_freqs->obs_min = min_score;
  1133.     score_freqs->obs_max = max_score;
  1134.     for (p = 0; p < query_length; p++) {
  1135.         if (query[p] == X) {
  1136.             continue;
  1137.         }
  1138.         for (c = 0; c < (Uint4) sbp->alphabet_size; c++) {
  1139.             const Uint4 kScore = pssm[p][aa_alphabet[c]];
  1140.             if (kScore <= BLAST_SCORE_MIN || kScore >= BLAST_SCORE_MAX) {
  1141.                 continue;
  1142.             }
  1143.             /* Increment the weight for the score in position p, residue c */
  1144.             score_freqs->sprob[kScore] += 
  1145.                 (std_probs[aa_alphabet[c]]/effective_length);
  1146.         }
  1147.     }
  1148.     for (s = min_score; s < max_score; s++) {
  1149.         score_freqs->score_avg += (s * score_freqs->sprob[s]);
  1150.     }
  1151.     return score_freqs;
  1152. }
  1153. void
  1154. _PSIUpdateLambdaK(const int** pssm,              /* [in] */
  1155.                   const Uint1* query,            /* [in] */
  1156.                   Uint4 query_length,            /* [in] */
  1157.                   const double* std_probs,       /* [in] */
  1158.                   double* initial_lambda_guess,  /* [in] */
  1159.                   BlastScoreBlk* sbp)            /* [in|out] */
  1160. {
  1161.     Blast_ScoreFreq* score_freqs = 
  1162.         _PSIComputeScoreProbabilities(pssm, query, query_length, 
  1163.                                       std_probs, sbp);
  1164.     if (initial_lambda_guess) {
  1165.         sbp->kbp_psi[0]->Lambda = Blast_KarlinLambdaNR(score_freqs, 
  1166.                                                        *initial_lambda_guess);
  1167.     } else {
  1168.         /* Calculate lambda and K */
  1169.         Blast_KarlinBlkCalc(sbp->kbp_psi[0], score_freqs);
  1170.         /* Shouldn't this be in a function? */
  1171.         sbp->kbp_gap_psi[0]->K = 
  1172.             sbp->kbp_psi[0]->K * sbp->kbp_gap_std[0]->K / sbp->kbp_ideal->K;
  1173.         sbp->kbp_gap_psi[0]->logK = log(sbp->kbp_gap_psi[0]->K);
  1174.     }
  1175.     score_freqs = Blast_ScoreFreqDestruct(score_freqs);
  1176. }
  1177. /****************************************************************************/
  1178. /* Function definitions for auxiliary functions for the stages above */
  1179. int
  1180. _PSIPurgeAlignedRegion(PsiAlignmentData* alignment,
  1181.                        unsigned int seq_index,
  1182.                        unsigned int start,
  1183.                        unsigned int stop)
  1184. {
  1185.     PsiDesc* sequence_position = NULL;
  1186.     unsigned int i = 0;
  1187.     if (!alignment)
  1188.         return PSIERR_BADPARAM;
  1189.     /* Cannot remove the query sequence from multiple alignment data or
  1190.        bad index */
  1191.     if (seq_index == kQueryIndex || 
  1192.         seq_index > alignment->dimensions->num_seqs + 1 ||
  1193.         stop > alignment->dimensions->query_sz)
  1194.         return PSIERR_BADPARAM;
  1195.     sequence_position = alignment->desc_matrix[seq_index];
  1196.     fprintf(stderr, "NEW purge %d (%d-%d)n", seq_index, start, stop);
  1197.     for (i = start; i < stop; i++) {
  1198.         sequence_position[i].letter = (unsigned char) -1;
  1199.         sequence_position[i].used = FALSE;
  1200.         sequence_position[i].e_value = kDefaultEvalueForPosition;
  1201.         sequence_position[i].extents.left = (unsigned int) -1;
  1202.         sequence_position[i].extents.right = alignment->dimensions->query_sz;
  1203.     }
  1204.     _PSIDiscardIfUnused(alignment, seq_index);
  1205.     return PSI_SUCCESS;
  1206. }
  1207. /* Check if we still need this sequence */
  1208. void
  1209. _PSIDiscardIfUnused(PsiAlignmentData* alignment, unsigned int seq_index)
  1210. {
  1211.     Boolean contains_aligned_regions = FALSE;
  1212.     unsigned int i = 0;
  1213.     for (i = 0; i < alignment->dimensions->query_sz; i++) {
  1214.         if (alignment->desc_matrix[seq_index][i].used) {
  1215.             contains_aligned_regions = TRUE;
  1216.             break;
  1217.         }
  1218.     }
  1219.     if ( !contains_aligned_regions ) {
  1220.         alignment->use_sequences[seq_index] = FALSE;
  1221.         fprintf(stderr, "NEW Removing sequence %dn", seq_index);
  1222.     }
  1223. }
  1224. /****************************************************************************/
  1225. double*
  1226. _PSIGetStandardProbabilities(const BlastScoreBlk* sbp)
  1227. {
  1228.     Blast_ResFreq* standard_probabilities = NULL;
  1229.     Uint4 i = 0;
  1230.     double* retval = NULL;
  1231.     if ( !(retval = (double*) malloc(sbp->alphabet_size * sizeof(double))))
  1232.         return NULL;
  1233.     standard_probabilities = Blast_ResFreqNew(sbp);
  1234.     Blast_ResFreqStdComp(sbp, standard_probabilities);
  1235.     for (i = 0; i < (Uint4) sbp->alphabet_size; i++) {
  1236.         retval[i] = standard_probabilities->prob[i];
  1237.     }
  1238.     Blast_ResFreqDestruct(standard_probabilities);
  1239.     return retval;
  1240. }
  1241. PsiDiagnostics*
  1242. _PSISaveDiagnostics(const PsiAlignmentData* alignment,
  1243.                     const PsiAlignedBlock* aligned_block,
  1244.                     const PsiSequenceWeights* seq_weights)
  1245. {
  1246.     /* _PSICalculateInformationContent(seq_weights); */
  1247.     abort();
  1248.     return NULL;
  1249. }
  1250. /*
  1251.  * ===========================================================================
  1252.  * $Log: blast_psi_priv.c,v $
  1253.  * Revision 1000.1  2004/06/01 18:07:34  gouriano
  1254.  * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.6
  1255.  *
  1256.  * Revision 1.6  2004/05/28 17:35:03  camacho
  1257.  * Fix msvc6 warnings
  1258.  *
  1259.  * Revision 1.5  2004/05/28 16:00:09  camacho
  1260.  * + first port of PSSM generation engine
  1261.  *
  1262.  * Revision 1.4  2004/05/19 14:52:02  camacho
  1263.  * 1. Added doxygen tags to enable doxygen processing of algo/blast/core
  1264.  * 2. Standardized copyright, CVS $Id string, $Log and rcsid formatting and i
  1265.  *    location
  1266.  * 3. Added use of @todo doxygen keyword
  1267.  *
  1268.  * Revision 1.3  2004/05/06 14:01:40  camacho
  1269.  * + _PSICopyMatrix
  1270.  *
  1271.  * Revision 1.2  2004/04/07 22:08:37  kans
  1272.  * needed to include blast_def.h for sfree prototype
  1273.  *
  1274.  * Revision 1.1  2004/04/07 19:11:17  camacho
  1275.  * Initial revision
  1276.  *
  1277.  * ===========================================================================
  1278.  */