aa_ungapped.c
上传用户:yhdzpy8989
上传日期:2007-06-13
资源大小:13604k
文件大小:18k
源码类别:

生物技术

开发平台:

C/C++

  1. /*
  2.  * ===========================================================================
  3.  * PRODUCTION $Log: aa_ungapped.c,v $
  4.  * PRODUCTION Revision 1000.2  2004/06/01 18:07:00  gouriano
  5.  * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.32
  6.  * PRODUCTION
  7.  * ===========================================================================
  8.  */
  9. /* $Id: aa_ungapped.c,v 1000.2 2004/06/01 18:07:00 gouriano Exp $
  10.  * ===========================================================================
  11.  *
  12.  *                            PUBLIC DOMAIN NOTICE
  13.  *               National Center for Biotechnology Information
  14.  *
  15.  *  This software/database is a "United States Government Work" under the
  16.  *  terms of the United States Copyright Act.  It was written as part of
  17.  *  the author's offical duties as a United States Government employee and
  18.  *  thus cannot be copyrighted.  This software/database is freely available
  19.  *  to the public for use. The National Library of Medicine and the U.S.
  20.  *  Government have not placed any restriction on its use or reproduction.
  21.  *
  22.  *  Although all reasonable efforts have been taken to ensure the accuracy
  23.  *  and reliability of the software and data, the NLM and the U.S.
  24.  *  Government do not and cannot warrant the performance or results that
  25.  *  may be obtained by using this software or data. The NLM and the U.S.
  26.  *  Government disclaim all warranties, express or implied, including
  27.  *  warranties of performance, merchantability or fitness for any particular
  28.  *  purpose.
  29.  *
  30.  *  Please cite the author in any work or product based on this material.
  31.  *
  32.  * ===========================================================================
  33.  */
  34. /** @file aa_ungapped.c
  35.  * @todo FIXME Need description
  36.  */
  37. static char const rcsid[] = 
  38.     "$Id: aa_ungapped.c,v 1000.2 2004/06/01 18:07:00 gouriano Exp $";
  39. #include <algo/blast/core/aa_ungapped.h>
  40. Int2 BlastAaWordFinder(BLAST_SequenceBlk* subject,
  41.                        BLAST_SequenceBlk* query,
  42.                        LookupTableWrap* lut_wrap,
  43.                        Int4** matrix,
  44.                        const BlastInitialWordParameters* word_params,
  45.                        Blast_ExtendWord* ewp,
  46.                        Uint4* query_offsets,
  47.                        Uint4* subject_offsets,
  48.                        Int4 offset_array_size,
  49.                        BlastInitHitList* init_hitlist, 
  50.                        BlastUngappedStats* ungapped_stats)
  51. {
  52.   Int2 status=0;
  53.   /* find the word hits and do ungapped extensions */
  54.   if (ewp->diag_table->multiple_hits)
  55.     {
  56.       status = BlastAaWordFinder_TwoHit(subject, query,
  57.                                         lut_wrap, ewp->diag_table,
  58.                                         matrix,
  59.                                         word_params->cutoff_score,
  60.                                         word_params->x_dropoff,
  61.                                         query_offsets, subject_offsets,
  62.                                         offset_array_size,
  63.                                         init_hitlist, ungapped_stats);
  64.     }
  65.   else
  66.     {
  67.       status = BlastAaWordFinder_OneHit(subject, query,
  68.                                         lut_wrap, ewp->diag_table,
  69.                                         matrix,
  70.                                         word_params->cutoff_score,
  71.                                         word_params->x_dropoff,
  72.                                         query_offsets, subject_offsets,
  73.                                         offset_array_size,
  74.                                         init_hitlist, ungapped_stats);
  75.     }
  76.   return status;
  77. }
  78. Int2 BlastAaWordFinder_TwoHit(const BLAST_SequenceBlk* subject,
  79.       const BLAST_SequenceBlk* query,
  80.       const LookupTableWrap* lookup_wrap,
  81.       BLAST_DiagTable* diag,
  82.       Int4 ** matrix,
  83.       Int4 cutoff,
  84.       Int4 dropoff,
  85.       Uint4 * NCBI_RESTRICT query_offsets,
  86.       Uint4 * NCBI_RESTRICT subject_offsets,
  87.       Int4 array_size,
  88.       BlastInitHitList* ungapped_hsps, 
  89.                BlastUngappedStats* ungapped_stats)
  90. {
  91.    LookupTable* lookup=NULL;
  92.    RPSLookupTable* rps_lookup=NULL;
  93.    Boolean use_pssm;
  94.    Int4 wordsize;
  95.    Int4 i;
  96.    Int4 hits=0;
  97.    Int4 totalhits=0;
  98.    Int4 first_offset = 0;
  99.    Int4 last_offset;
  100.    Int4 score;
  101.    Int4 hsp_q, hsp_s, hsp_len;
  102.    Int4 window;
  103.    Int4 last_hit, s_last_off, diff;
  104.    Int4 diag_offset, diag_coord, diag_mask;
  105.    DiagStruct* diag_array;
  106.    Boolean right_extend;
  107.    Int4 hits_extended = 0;
  108.    if (diag == NULL)
  109.       return -1;
  110.    diag_offset = diag->offset;
  111.    diag_array = diag->diag_array;
  112.    diag_mask = diag->diag_mask;
  113.    window = diag->window;
  114.    if (lookup_wrap->lut_type == RPS_LOOKUP_TABLE) {
  115.       rps_lookup = (RPSLookupTable *)lookup_wrap->lut;
  116.       wordsize = rps_lookup->wordsize;
  117.    }
  118.    else {
  119.       lookup = (LookupTable *)lookup_wrap->lut;
  120.       wordsize = lookup->wordsize;
  121.    }
  122.    last_offset  = subject->length - wordsize;
  123.    use_pssm = (rps_lookup != NULL) || (lookup->use_pssm);
  124.    while(first_offset <= last_offset) {
  125.       /* scan the subject sequence for hits */
  126.       if (rps_lookup)
  127.          hits = BlastRPSScanSubject(lookup_wrap, subject, &first_offset, 
  128.                                 query_offsets, subject_offsets, array_size);
  129.       else
  130.          hits = BlastAaScanSubject(lookup_wrap, subject, &first_offset, 
  131.                                 query_offsets, subject_offsets, array_size);
  132.       totalhits += hits;
  133.       /* for each hit, */
  134.       for (i = 0; i < hits; ++i)
  135.       {
  136.          /* calculate the diagonal associated with this query-subject
  137.             pair, and find the distance to the last hit on this diagonal */
  138.          diag_coord = 
  139.             (subject_offsets[i]  - query_offsets[i]) & diag_mask;
  140.          
  141.          last_hit = diag_array[diag_coord].last_hit - diag_offset;
  142.          diff = subject_offsets[i] - last_hit;
  143.          if (diff >= window) {
  144.             /* We are beyond the window for this diagonal; start a new hit */
  145.             diag_array[diag_coord].last_hit = subject_offsets[i] + diag_offset;
  146.          } 
  147.          else {
  148.             /* If the difference is negative, or is less than the 
  149.                wordsize (i.e. last hit and this hit overlap), give up */
  150.             if (diff < wordsize)
  151.                continue;
  152.             /* If a previous extension has not already 
  153.                covered this portion of the query and subject 
  154.                sequences, do the ungapped extension */
  155.             right_extend = TRUE;
  156.             if ((Int4)(subject_offsets[i] + diag_offset) > 
  157.                 diag_array[diag_coord].diag_level) {
  158.                /* Extend this pair of hits. The extension to the left must 
  159.                   reach the end of the first word in order for extension 
  160.                   to the right to proceed. */
  161.                hsp_len = 0;
  162.                score = BlastAaExtendTwoHit(matrix, subject, query,
  163.                                         last_hit + wordsize, 
  164.                                         subject_offsets[i], query_offsets[i], 
  165.                                         dropoff, &hsp_q, &hsp_s, 
  166.                                         &hsp_len, use_pssm,
  167.                                         wordsize, &right_extend, &s_last_off);
  168.                ++hits_extended;
  169.                /* if the hsp meets the score threshold, report it */
  170.                if (score >= cutoff)
  171.                   BlastSaveInitHsp(ungapped_hsps, hsp_q, hsp_s,
  172.                         query_offsets[i], subject_offsets[i], hsp_len, score);
  173.                /* whether or not the score was high enough, remember
  174.                   the rightmost subject word offset examined. Future hits must
  175.                   lie at least one letter to the right of this point */
  176.                diag_array[diag_coord].diag_level = 
  177.                         s_last_off - (wordsize - 1) + diag_offset;
  178.             }
  179.             /* If an extension to the right happened (or no extension
  180.                at all), reset the last hit so that future hits to this 
  181.                diagonal must start over. Otherwise make the present hit
  182.                into the previous hit for this diagonal */
  183.             if (right_extend)
  184.                diag_array[diag_coord].last_hit = 0;
  185.             else
  186.                diag_array[diag_coord].last_hit = 
  187.                         subject_offsets[i] + diag_offset;
  188.          } 
  189.       }/* end for - done with this batch of hits, call scansubject again. */
  190.    } /* end while - done with the entire sequence. */
  191.  
  192.    /* increment the offset in the diagonal array */
  193.    DiagUpdate(diag, subject->length + window); 
  194.  
  195.    Blast_UngappedStatsUpdate(ungapped_stats, totalhits, hits_extended, 
  196.                              ungapped_hsps->total);
  197.    return 0;
  198. }
  199. Int2 BlastAaWordFinder_OneHit(const BLAST_SequenceBlk* subject,
  200.       const BLAST_SequenceBlk* query,
  201.       const LookupTableWrap* lookup_wrap,
  202.       BLAST_DiagTable* diag,
  203.       Int4 ** matrix,
  204.       Int4 cutoff,
  205.       Int4 dropoff,
  206.       Uint4 * NCBI_RESTRICT query_offsets,
  207.       Uint4 * NCBI_RESTRICT subject_offsets,
  208.       Int4 array_size,
  209.                BlastInitHitList* ungapped_hsps, 
  210.                BlastUngappedStats* ungapped_stats)
  211. {
  212.    LookupTable* lookup=NULL;
  213.    RPSLookupTable* rps_lookup=NULL;
  214.    Boolean use_pssm;
  215.    Int4 wordsize;
  216.    Int4 hits=0;
  217.    Int4 totalhits=0;
  218.    Int4 first_offset = 0;
  219.    Int4 last_offset;
  220.    Int4 hsp_q, hsp_s, hsp_len;
  221.    Int4 s_last_off;
  222.    Int4 i;
  223.    Int4 score;
  224.    Int4 diag_offset, diag_coord, diag_mask, diff;
  225.    DiagStruct* diag_array;
  226.    Int4 hits_extended = 0;
  227.    if (!diag) 
  228.       return -1;
  229.    
  230.    diag_offset = diag->offset;
  231.    diag_array = diag->diag_array;
  232.    diag_mask = diag->diag_mask;
  233.    
  234.    if (lookup_wrap->lut_type == RPS_LOOKUP_TABLE) {
  235.       rps_lookup = (RPSLookupTable *)lookup_wrap->lut;
  236.       wordsize = rps_lookup->wordsize;
  237.    }
  238.    else {
  239.       lookup = (LookupTable *)lookup_wrap->lut;
  240.       wordsize = lookup->wordsize;
  241.    }
  242.    last_offset  = subject->length - wordsize;
  243.    use_pssm = (rps_lookup != NULL) || (lookup->use_pssm);
  244.    while(first_offset <= last_offset)
  245.    {
  246.       /* scan the subject sequence for hits */
  247.       if (rps_lookup)
  248.          hits = BlastRPSScanSubject(lookup_wrap, subject, &first_offset, 
  249.                                 query_offsets, subject_offsets, array_size);
  250.       else
  251.          hits = BlastAaScanSubject(lookup_wrap, subject, &first_offset,
  252. query_offsets, subject_offsets, array_size);
  253.       totalhits += hits;
  254.       /* for each hit, */
  255.       for (i = 0; i < hits; ++i) {
  256.          diag_coord = 
  257.             (subject_offsets[i]  - query_offsets[i]) & diag_mask;
  258.            
  259.          diff = subject_offsets[i] - 
  260.             (diag_array[diag_coord].diag_level - diag_offset);
  261.          /* do an extension, but only if we have not already extended
  262.             this far */
  263.          if (diff > 0) {
  264.             ++hits_extended;
  265.             score=BlastAaExtendOneHit(matrix, subject, query,
  266.                      subject_offsets[i], query_offsets[i], dropoff,
  267.                      &hsp_q, &hsp_s, &hsp_len, use_pssm, &s_last_off);
  268.             /* if the hsp meets the score threshold, report it */
  269.             if (score >= cutoff) {
  270.                BlastSaveInitHsp(ungapped_hsps, hsp_q, hsp_s, 
  271.                   query_offsets[i], subject_offsets[i], hsp_len, score);
  272.             }
  273.             diag_array[diag_coord].diag_level = 
  274.                   s_last_off - (lookup->wordsize - 1) + diag_offset;
  275.          }
  276.       } /* end for */
  277.    } /* end while */
  278.    /* increment the offset in the diagonal array (no windows used) */
  279.    DiagUpdate(diag, subject->length); 
  280.  
  281.    Blast_UngappedStatsUpdate(ungapped_stats, totalhits, hits_extended, 
  282.                              ungapped_hsps->total);
  283.    return 0;
  284. }
  285. Int4 BlastAaExtendRight(Int4 ** matrix,
  286. const BLAST_SequenceBlk* subject,
  287. const BLAST_SequenceBlk* query,
  288. Int4 s_off,
  289. Int4 q_off,
  290. Int4 dropoff,
  291. Int4* displacement,
  292.                         Int4 maxscore,
  293.                         Int4* s_last_off)
  294. {
  295.   Int4 i, n, best_i = -1;
  296.   Int4 score = maxscore;
  297.   Uint1* s,* q;
  298.   n = MIN( subject->length - s_off , query->length - q_off );
  299.   s=subject->sequence + s_off;
  300.   q=query->sequence + q_off;
  301.   for(i = 0; i < n; i++) {
  302.      score += matrix[ q[i] ] [ s[i] ];
  303.      if (score > maxscore) {
  304.         maxscore = score;
  305.         best_i = i;
  306.      }
  307.      if (score <= 0 || (maxscore - score) >= dropoff)
  308.         break;
  309.   }
  310.   *displacement = best_i;
  311.   *s_last_off = s_off + i;
  312.   return maxscore;
  313. }
  314. Int4 BlastAaExtendLeft(Int4 ** matrix,
  315.        const BLAST_SequenceBlk* subject,
  316.        const BLAST_SequenceBlk* query,
  317.        Int4 s_off,
  318.        Int4 q_off,
  319.        Int4 dropoff,
  320.        Int4* displacement)
  321. {
  322.    Int4 i, n, best_i;
  323.    Int4 score = 0;
  324.    Int4 maxscore = 0;
  325.    
  326.    Uint1* s,* q;
  327.    
  328.    n = MIN( s_off , q_off );
  329.    best_i = n + 1;
  330.    s = subject->sequence + s_off - n;
  331.    q = query->sequence + q_off - n;
  332.    
  333.    for(i = n; i >= 0; i--) {
  334.       score += matrix[ q[i] ] [ s[i] ];
  335.       
  336.       if (score > maxscore) {
  337.          maxscore = score;
  338.          best_i = i;
  339.       }
  340.       if ((maxscore - score) >= dropoff)
  341.          break;
  342.    }
  343.    
  344.    *displacement = n - best_i;
  345.    return maxscore;
  346. }
  347. Int4 BlastPSSMExtendRight(Int4 ** matrix,
  348. const BLAST_SequenceBlk* subject,
  349. Int4 query_size,
  350. Int4 s_off,
  351. Int4 q_off,
  352. Int4 dropoff,
  353. Int4* displacement,
  354.                         Int4 maxscore,
  355. Int4* s_last_off)
  356. {
  357.   Int4 i, n, best_i = -1;
  358.   Int4 score = maxscore;
  359.   Uint1* s;
  360.   n = MIN( subject->length - s_off , query_size - q_off );
  361.   s = subject->sequence + s_off;
  362.   for(i = 0; i < n; i++) {
  363.      score += matrix[ q_off+i ] [ s[i] ];
  364.      if (score > maxscore) {
  365.         maxscore = score;
  366.         best_i = i;
  367.      }
  368.      if (score <= 0 || (maxscore - score) >= dropoff)
  369.         break;
  370.   }
  371.   *displacement = best_i;
  372.   *s_last_off = s_off + i;
  373.   return maxscore;
  374. }
  375. Int4 BlastPSSMExtendLeft(Int4 ** matrix,
  376.        const BLAST_SequenceBlk* subject,
  377.        Int4 query_size,
  378.        Int4 s_off,
  379.        Int4 q_off,
  380.        Int4 dropoff,
  381.        Int4* displacement)
  382. {
  383.    Int4 i, n, best_i;
  384.    Int4 score = 0;
  385.    Int4 maxscore = 0;
  386.    Uint1* s;
  387.    
  388.    n = MIN( s_off , q_off );
  389.    best_i = n + 1;
  390.    s = subject->sequence + s_off - n;
  391.    
  392.    for(i = n; i >= 0; i--) {
  393.       score += matrix[ q_off-n+i ] [ s[i] ];
  394.       if (score > maxscore) {
  395.          maxscore = score;
  396.          best_i = i;
  397.       }
  398.       if ((maxscore - score) >= dropoff)
  399.          break;
  400.    }
  401.    
  402.    *displacement = n - best_i;
  403.    return maxscore;
  404. }
  405. Int4 BlastAaExtendOneHit(Int4 ** matrix,
  406.                          const BLAST_SequenceBlk* subject,
  407.                          const BLAST_SequenceBlk* query,
  408.                          Int4 s_off,
  409.                          Int4 q_off,
  410.                          Int4 dropoff,
  411.  Int4* hsp_q,
  412.  Int4* hsp_s,
  413.  Int4* hsp_len,
  414.                          Boolean use_pssm,
  415.                          Int4* s_last_off)
  416. {
  417.   Int4 left_score, right_score;
  418.   Int4 left_disp, right_disp;
  419.   if (use_pssm) {
  420.      left_score = BlastPSSMExtendLeft(matrix, subject, query->length, 
  421.                            s_off, q_off, dropoff, &left_disp);
  422.      right_score = BlastPSSMExtendRight(matrix, subject, query->length, 
  423.                            s_off+1, q_off+1, dropoff, &right_disp, 
  424.                            left_score, s_last_off);
  425.   }
  426.   else {
  427.      left_score = BlastAaExtendLeft(matrix, subject, query, 
  428.                             s_off, q_off, dropoff, &left_disp);
  429.      right_score = BlastAaExtendRight(matrix, subject, query, 
  430.                             s_off+1, q_off+1, dropoff, &right_disp, 
  431.                             left_score, s_last_off);
  432.   }
  433.   
  434.   *hsp_q   = q_off - left_disp;
  435.   *hsp_s   = s_off - left_disp;
  436.   *hsp_len = left_disp + right_disp + 2;
  437.   
  438.   return right_score;
  439. }
  440. Int4 
  441. BlastAaExtendTwoHit(Int4 ** matrix,
  442.                     const BLAST_SequenceBlk* subject,
  443.                     const BLAST_SequenceBlk* query,
  444.                     Int4 s_left_off,
  445.                     Int4 s_right_off,
  446.                     Int4 q_right_off,
  447.                     Int4 dropoff,
  448.                     Int4* hsp_q,
  449.                     Int4* hsp_s,
  450.                     Int4* hsp_len,
  451.                     Boolean use_pssm,
  452.                     Int4 word_size,
  453.                     Boolean *right_extend,
  454.                     Int4 *s_last_off)
  455. {
  456.    Int4 left_d = 0, right_d = 0; /* left and right displacements */
  457.    Int4 left_score = 0, right_score = 0; /* left and right scores */
  458.    Int4 i, score = 0;
  459.    Uint1 *s = subject->sequence;
  460.    Uint1 *q = query->sequence;
  461.    /* find the position (up to word_size-1 letters to the
  462.       right) that gives the largest starting score */
  463.    for (i = 0; i < word_size; i++) {
  464.       if (use_pssm)
  465.          score += matrix[ q_right_off+i ][ s[s_right_off+i] ];
  466.       else
  467.          score += matrix[ q[q_right_off+i] ][ s[s_right_off+i] ];
  468.       if (score > left_score) {
  469.          left_score = score;
  470.          right_d = i;
  471.       }
  472.    }
  473.    q_right_off += right_d;
  474.    s_right_off += right_d;
  475.    right_d = -1;
  476.    *right_extend = FALSE;
  477.    *s_last_off = s_right_off;
  478.    /* first, try to extend left, from the second hit to the first hit. */
  479.    if (use_pssm)
  480.       left_score = BlastPSSMExtendLeft(matrix, subject, query->length,
  481.           s_right_off, q_right_off, dropoff, &left_d);
  482.    else
  483.       left_score = BlastAaExtendLeft(matrix, subject, query,
  484.           s_right_off, q_right_off, dropoff, &left_d);
  485.    /* Extend to the right only if left extension reached the first hit. */
  486.    if (left_d >= (s_right_off - s_left_off)) {
  487.       *right_extend = TRUE;
  488.       if (use_pssm)
  489.          right_score = BlastPSSMExtendRight(matrix, subject, query->length,
  490.                                      s_right_off + 1, q_right_off + 1,
  491.                                      dropoff, &right_d, left_score, s_last_off);
  492.       else 
  493.          right_score = BlastAaExtendRight(matrix, subject, query,
  494.                                      s_right_off + 1, q_right_off + 1,
  495.                                      dropoff, &right_d, left_score, s_last_off);
  496.    }
  497.    *hsp_q = q_right_off - left_d;
  498.    *hsp_s = s_right_off - left_d;
  499.    *hsp_len = left_d + right_d + 2;
  500.    return MAX(left_score,right_score);
  501. }
  502. Int4 DiagUpdate(BLAST_DiagTable* diag, Int4 length)
  503. {
  504.   if (diag == NULL)
  505.     return 0;
  506.   if (diag->offset >= INT4_MAX/2)
  507.     {
  508.       DiagClear(diag);
  509.     }
  510.   else
  511.     {
  512.       diag->offset += length;
  513.     }
  514.   return 0;
  515. }
  516. Int4 DiagClear(BLAST_DiagTable* diag)
  517. {
  518.   Int4 i,n;
  519.   if (diag==NULL)
  520.     return 0;
  521.   n=diag->diag_array_length;
  522.   for(i=0;i<n;i++)
  523.     {
  524.       diag->diag_array[i].diag_level = 0;
  525.       diag->diag_array[i].last_hit = - diag->window;
  526.     }
  527.   return 0;
  528. }