multiAlign.c
资源名称:MSA_MST.rar [点击查看]
上传用户:szpanda
上传日期:2016-03-09
资源大小:9k
文件大小:11k
源码类别:
DNA
开发平台:
C/C++
- /* File: multiAlign.c */
- /* Author:Qichan Ma, Student No.:250494898 */
- #include "globals.h"
- #include "pairwiseAlign.h"
- #include "multiAlign.h"
- /* creat a new set whose only member is x */
- void Make_set(int x) {
- parent[x] = x;
- rank[x] = 0;
- }
- /* subroutine of Union */
- void Link(int x, int y) {
- if(rank[x] > rank[y])
- parent[y] = x;
- else if(rank[x] < rank[y])
- parent[x] = y;
- else if(x != y) {
- parent[y] = x;
- rank[x] += 1;
- }
- }
- /* unite the dynamic sets that contain x and y into one set */
- void Union(int x, int y) {
- Link(Find_set(x), Find_set(y));
- }
- /* returns a pointer to the representative of the set containing x */
- int Find_set(int x) {
- if(x != parent[x])
- parent[x] = Find_set(parent[x]);
- return parent[x];
- }
- /* adjust down the element in the heap */
- void AdjustDown (int r, int n)
- { /* r is the index of the element we need to adjust */
- int child;
- int temp=heap[r];
- child=2*r; /* child is the left child of r */
- while (child<=n) {
- if (( child < n) && ( e[heap[child]-1].cost < e[heap[child+1]-1].cost))
- child++;
- if (e[temp-1].cost >= e[heap[child]-1].cost)
- break;
- heap[child/2]=heap[child];
- child*=2;
- }
- heap[child/2]=temp;
- }
- /* bulid the heap */
- void BuildMaxHeap(int n)
- {
- int i;
- for(i=(int)n/2;i>0;i--)
- AdjustDown(i,n);
- }
- /* delete the minimal element from the heap */
- int DeleteMax() {
- int max=heap[1];
- heap[1]=heap[hsize--];/* put the last element on the first position */
- AdjustDown(1,hsize); /* adjust to minheap */
- return max;
- }
- /* do the multiple sequence alignment using Kruskal's maximum spanning tree algorithm */
- void MST_Kruskal() {
- int i, j;
- /* the number of the edges of the complete graph */
- int numOfEdge = numOfSeq*(numOfSeq-1)/2;
- e = (Edge *)malloc(numOfEdge*sizeof(Edge)); /* the edges array */
- parent = (int *)malloc(numOfSeq*sizeof(int)); /* the parent array */
- rank = (int *)malloc(numOfSeq*sizeof(int)); /* the rank array */
- heap = (int *)malloc((numOfEdge+1)*sizeof(int)); /* the heap used to sort edges and get the minimal cost edge */
- int k=0;
- /* initialize the edges with vertex and cost*/
- for(i=0; i<numOfSeq; i++){
- for(j=i+1; j<numOfSeq; j++) {
- e[k].u = i;
- e[k].v = j;
- e[k].cost = pairwiseScore(seq[i], seq[j]);
- k++;
- }
- }
- int maxMultiAlignLen = 0; /* the maximal length of the multiple alignments */
- for(i=0; i<numOfSeq; i++)
- maxMultiAlignLen += seqLen[i];
- /* initialize the multiAlign array */
- multiAlign = (char **)malloc(numOfSeq*sizeof(char *));
- for (i=0; i<numOfSeq; i++)
- multiAlign[i] = (char *)malloc(maxMultiAlignLen*sizeof(char));
- for(i=0; i<numOfSeq; i++){
- strcpy(multiAlign[i], seq[i]);
- }
- /* initialize the Union-Find sets */
- for(i=0; i<numOfSeq; i++)
- Make_set(i);
- /* initialize the heap */
- hsize=numOfEdge;
- for(i=1; i<=hsize; i++)
- heap[i]=i;
- BuildMaxHeap(hsize);
- int a, b, p, q, len1, len2, len3, block;
- while(hsize>0) {
- i=DeleteMax();
- a=Find_set(e[i-1].u);
- b=Find_set(e[i-1].v);
- /*if a,b are in the different sets, align the 2 sequences and unite the 2 sets with adjustment */
- if(a !=b ) {
- pairwiseAlign(seq[e[i-1].u], seq[e[i-1].v]);
- if(opt2 == 2) {
- fprintf(out, "align Sequences(%d, %d) with cost %dn", e[i-1].u+1, e[i-1].v+1, e[i-1].cost);
- fprintf(out, "the optiaml pairwise alignment of Sequences(%d, %d) with score %d:n", e[i-1].u+1, e[i-1].v+1, e[i-1].cost);
- len1 = strlen(align1);
- block = (len1/50)+1;
- for (q=0; q<block-1; q++) {
- fprintf(out, "(Sequence %2d) %4d ", e[i-1].u+1, q*50+1);
- for(j=q*50; j<(q+1)*50; j++)
- fprintf(out, "%c",align1[j]);
- fprintf(out, "n");
- fprintf(out, "(Sequence %2d) %4d ", e[i-1].v+1, q*50+1);
- for(j=q*50; j<(q+1)*50; j++)
- fprintf(out, "%c",align2[j]);
- fprintf(out, "nn");
- }
- if((block-1)*50 < len1) {
- fprintf(out, "(Sequence %2d) %4d ", e[i-1].u+1, (block-1)*50+1);
- for(j=(block-1)*50; j<len1; j++)
- fprintf(out, "%c",align1[j]);
- fprintf(out, "n");
- fprintf(out, "(Sequence %2d) %4d ", e[i-1].v+1, (block-1)*50+1);
- for(j=(block-1)*50; j<len1; j++)
- fprintf(out, "%c",align2[j]);
- fprintf(out, "nn");
- }
- }
- /* unite the 2 sets with adjustment */
- for(j=0; (align1[j]!= ' ' || multiAlign[e[i-1].u][j]!= ' ') || (align2[j]!= ' ' || multiAlign[e[i-1].v][j]!= ' '); j++) {
- len1=strlen(align1);
- len2=strlen(multiAlign[e[i-1].u]);
- len3=strlen(multiAlign[e[i-1].v]);
- if(align1[j]=='-' && multiAlign[e[i-1].u][j]!='-') {
- for(p=0; p<numOfSeq; p++) {
- if(Find_set(p) == a) {
- for(k=len2; k>=j; k--)
- multiAlign[p][k+1] = multiAlign[p][k];
- multiAlign[p][j]='-';
- }
- }
- }
- else if(align2[j]=='-' && multiAlign[e[i-1].v][j]!='-') {
- for(p=0; p<numOfSeq; p++) {
- if(Find_set(p) == b) {
- for(k=len3; k>=j; k--)
- multiAlign[p][k+1] = multiAlign[p][k];
- multiAlign[p][j]='-';
- }
- }
- }
- else if(multiAlign[e[i-1].u][j]=='-' && align1[j]!='-') {
- for(k=len1; k>=j; k--) {
- align1[k+1] = align1[k];
- align2[k+1] = align2[k];
- }
- align1[j] = align2[j] = '-';
- for(p=0; p<numOfSeq; p++) {
- if(Find_set(p) == b) {
- for(k=len3; k>=j; k--)
- multiAlign[p][k+1] = multiAlign[p][k];
- multiAlign[p][j]='-';
- }
- }
- }
- else if(multiAlign[e[i-1].v][j]=='-' && align2[j]!='-') {
- for(k=len1; k>=j; k--) {
- align1[k+1] = align1[k];
- align2[k+1] = align2[k];
- }
- align1[j] = align2[j] = '-';
- for(p=0; p<numOfSeq; p++) {
- if(Find_set(p) == a) {
- for(k=len2; k>=j; k--)
- multiAlign[p][k+1] = multiAlign[p][k];
- multiAlign[p][j]='-';
- }
- }
- }
- }
- Union(a,b); /* unite the 2 sets */
- /* output the current multiple alignment */
- if(opt2 == 2) {
- fprintf(out, "unite 2 sets and adjust the sequences in the sets, the current multiple alignment is:n");
- len1 = strlen(multiAlign[0]);
- block = (len1/50)+1;
- for (q=0; q<block-1; q++) {
- for(p=0; p<numOfSeq; p++) {
- if(Find_set(p)==a || Find_set(p)==b) {
- fprintf(out, "(Sequence %2d) %4d ", p+1, q*50+1);
- for(j=q*50; j<(q+1)*50; j++)
- fprintf(out, "%c",multiAlign[p][j]);
- fprintf(out, "n");
- }
- }
- fprintf(out, "n");
- }
- if((block-1)*50 < len1) {
- for(p=0; p<numOfSeq; p++) {
- if(Find_set(p)==a || Find_set(p)==b) {
- fprintf(out, "(Sequence %2d) %4d ", p+1, (block-1)*50+1);
- for(j=(block-1)*50; j<len1; j++)
- fprintf(out, "%c",multiAlign[p][j]);
- fprintf(out, "n");
- }
- }
- fprintf(out, "n");
- }
- fprintf(out, "nnn");
- }
- }
- }
- free(e);
- free(parent);
- free(rank);
- free(heap);
- }
- /* compute the pairwise alignment score */
- int AlignScore(char *x, char *y){
- int n = strlen(x);
- int sum=0, gap_ext=-2, gap_ini=-8;
- int i;
- if(x[0]!='-' && y[0]!='-')
- sum += score[alphaIndex[x[0]-65]][alphaIndex[y[0]-65]];
- else if((x[0]=='-' && y[0]!='-')||(y[0]=='-' && x[0]!='-'))
- sum += gap_ini+gap_ext;
- for(i=1; i<n; i++) {
- if (x[i]!='-' && y[i]!='-')
- sum += score[alphaIndex[x[i]-65]][alphaIndex[y[i]-65]];
- else if ((x[i]=='-' && y[i]!='-')||(y[i]=='-' && x[i]!='-')) {
- sum += gap_ext;
- if ((x[i]=='-' && x[i-1]!='-')||(y[i]=='-' && y[i-1]!='-')||(x[i-1]=='-' && y[i-1]=='-'))
- sum += gap_ini;
- }
- }
- return sum;
- }
- /* output the multiple alignment & SP score */
- void printMultiAlign() {
- int i, j, k ;
- /* compute and output the SP score of the multiple alignment */
- int sum=0;
- for(i=0; i<numOfSeq; i++){
- for(j=i+1; j<numOfSeq; j++) {
- sum += AlignScore(multiAlign[i], multiAlign[j]);
- }
- }
- fprintf(out, "The score of the multiple alignment using maximum spanning tree is: %dnn", sum);
- /* output the multiple alignment */
- fprintf(out, "The multiple sequence alignment using maximum spanning tree is:n");
- for(i=0; i<numOfSeq; i++){
- fprintf(out, "Sequence %2d: %sn", i+1, seqName[i]);
- }
- fprintf(out, "n");
- int len = strlen(multiAlign[0]);
- int block = (len/50)+1;
- for (k=0; k<block-1; k++) {
- for(i=0; i<numOfSeq; i++){
- fprintf(out, "(Sequence %2d) %4d ", i+1, k*50+1);
- for(j=k*50; j<(k+1)*50; j++)
- fprintf(out, "%c",multiAlign[i][j]);
- fprintf(out, "n");
- }
- fprintf(out, "n");
- }
- if((block-1)*50 < len) {
- for(i=0; i<numOfSeq; i++){
- fprintf(out, "(Sequence %2d) %4d ", i+1, (block-1)*50+1);
- for(j=(block-1)*50; j<len; j++)
- fprintf(out, "%c",multiAlign[i][j]);
- fprintf(out, "n");
- }
- fprintf(out, "n");
- }
- }