alnread.c
上传用户:yhdzpy8989
上传日期:2007-06-13
资源大小:13604k
文件大小:173k
- return offset;
- }
- }
- return -1;
- }
- /* This function returns a pointer to the memory in which the ID for the
- * Nth sequence is stored, unless there aren't that many sequences, in which
- * case NULL is returned.
- */
- static char *
- s_GetAlignRawSeqIDByOffset
- (TAlignRawSeqPtr list,
- int offset)
- {
- TAlignRawSeqPtr arsp;
- int index;
- arsp = list;
- index = 0;
- while ( arsp != NULL && index != offset ) {
- arsp = arsp->next;
- index++;
- }
- if (index == offset && arsp != NULL) {
- return arsp->id;
- } else {
- return NULL;
- }
- }
- /* This function adds data to a sequence by looking for the specified ID in
- * the list. If the id is not found, a new sequence with that ID is added to
- * the end of the list.
- * The function returns a pointer to the first item in the list.
- */
- static TAlignRawSeqPtr
- s_AddAlignRawSeqById
- (TAlignRawSeqPtr list,
- char * id,
- char * data,
- int id_line_num,
- int data_line_num,
- int data_line_offset)
- {
- TAlignRawSeqPtr arsp;
- TIntLinkPtr ilp;
- arsp = s_FindAlignRawSeqById (list, id);
- if (arsp == NULL) {
- arsp = s_AlignRawSeqNew (list);
- if (arsp == NULL) {
- return NULL;
- }
- if (list == NULL) list = arsp;
- arsp->id = strdup (id);
- }
- arsp->sequence_data = s_AddLineInfo (arsp->sequence_data,
- data,
- data_line_num,
- data_line_offset);
- ilp = s_IntLinkNew (id_line_num, arsp->id_lines);
- if (arsp->id_lines == NULL) arsp->id_lines = ilp;
- return list;
- }
- /* This function adds data to the Nth sequence in the sequence list and
- * returns eTrue, unless there aren't that many sequences in the list, in
- * which case the function returns eFalse.
- */
- static EBool
- s_AddAlignRawSeqByIndex
- (TAlignRawSeqPtr list,
- int index,
- char * data,
- int data_line_num,
- int data_line_offset)
- {
- TAlignRawSeqPtr arsp;
- int curr;
- curr = 0;
- for (arsp = list; arsp != NULL && curr < index; arsp = arsp->next) {
- curr++;
- }
- if (arsp == NULL) {
- return eFalse;
- } else {
- arsp->sequence_data = s_AddLineInfo (arsp->sequence_data,
- data,
- data_line_num,
- data_line_offset);
- return eTrue;
- }
- }
- /* This function frees memory associated with the SAlignRawFileData structure.
- */
- static void s_AlignFileRawFree (SAlignRawFilePtr afrp)
- {
- if (afrp == NULL) {
- return;
- }
- s_LineInfoFree (afrp->organisms);
- s_LineInfoFree (afrp->deflines);
- s_LineInfoFree (afrp->line_list);
- s_AlignRawSeqFree (afrp->sequences);
- s_IntLinkFree (afrp->offset_list);
- free (afrp);
- }
- /* This function allocates memory for an SAlignRawFileData structure and
- * initializes its member variables. The function returns a pointer to
- * the newly allocated structure.
- */
- static SAlignRawFilePtr s_AlignFileRawNew (void)
- {
- SAlignRawFilePtr afrp;
- afrp = (SAlignRawFilePtr)malloc (sizeof (SAlignRawFileData));
- if (afrp == NULL) {
- return NULL;
- }
- afrp->marked_ids = eFalse;
- afrp->line_list = NULL;
- afrp->organisms = NULL;
- afrp->num_organisms = 0;
- afrp->deflines = NULL;
- afrp->num_deflines = 0;
- afrp->block_size = 0;
- afrp->offset_list = NULL;
- afrp->sequences = NULL;
- afrp->report_error = NULL;
- afrp->report_error_userdata = NULL;
- afrp->alphabet = NULL;
- afrp->expected_num_sequence = 0;
- afrp->expected_sequence_len = 0;
- afrp->num_segments = 1;
- return afrp;
- }
- /* The following functions are used to analyze the structure of a file and
- * assemble the sequences listed in the file.
- * Sequence data in a file is organized in one of two general formats -
- * interleaved or contiguous. Interleaved data can be recognized by looking
- * for repeated blocks of the same number of lines within a file separated
- * by blank or skippable lines from other lines in the file. The first of
- * these blocks must have at least two elements separated by whitespace
- * in each line, the first of these elements is the ID for the sequence in
- * that row and for the sequences in that position within the block for the
- * remainder of the file.
- * Contiguous data can be recognized by either looking for "marked" sequence
- * IDs, which begin with a '>' character, or by looking for repeated patterns
- * of lines with the same numbers of characters.
- */
- /* The following functions are used to analyze interleaved data. */
- /* This function creates a SLengthListData structure that describes the pattern
- * of character lengths in the string pointed to by cp.
- */
- static SLengthListPtr s_GetBlockPattern (char * cp)
- {
- SLengthListPtr this_pattern;
- int len;
- this_pattern = s_LengthListNew (NULL);
- if (this_pattern == NULL) {
- return NULL;
- }
- this_pattern->num_appearances = 1;
- while (*cp != 0) {
- len = strcspn (cp, " tr");
- s_AddLengthRepeat (this_pattern, len);
- cp += len;
- cp += strspn (cp, " tr");
- }
- return this_pattern;
- }
- /* This function attempts to predict whether the following lines will be
- * an interleaved block. If so, the function returns the location of the
- * beginning of the block, otherwise the function returns -1.
- */
- static int
- s_ForecastBlockPattern
- (SLengthListPtr pattern_list,
- TIntLinkPtr next_offset,
- int line_start,
- int block_size)
- {
- int line_counter;
- SLengthListPtr llp;
- line_counter = line_start;
- if (next_offset != NULL
- && next_offset->ival - line_counter < block_size) {
- return -1;
- }
-
- for (llp = pattern_list;
- llp != NULL
- && (next_offset == NULL || line_counter < next_offset->ival - 1)
- && line_counter - line_start < block_size;
- llp = llp->next)
- {
- if (llp->lengthrepeats == NULL) {
- return -1;
- }
- line_counter += llp->num_appearances;
- }
- if (line_counter - line_start == block_size) {
- if (llp->next == NULL) {
- return line_start;
- }
- llp = llp->next;
- if (llp->lengthrepeats == NULL) {
- return line_start;
- }
- }
- return -1;
- }
- /* This function looks for malformed blocks between the identified blocks
- * indicated by the offset_list. It returns a pointer to the list with the
- * new locations inserted at the appropriate locations.
- */
- static TIntLinkPtr
- s_AugmentBlockPatternOffsetList
- (SLengthListPtr pattern_list,
- TIntLinkPtr offset_list,
- int block_size)
- {
- int line_counter;
- SLengthListPtr llp;
- TIntLinkPtr next_offset, prev_offset, new_offset;
- int forecast_pos;
- prev_offset = NULL;
- next_offset = offset_list;
- line_counter = 0;
- llp = pattern_list;
- while (llp != NULL) {
- if (next_offset != NULL && line_counter == next_offset->ival) {
- prev_offset = next_offset;
- next_offset = next_offset->next;
- /* skip past the lines for this block */
- while (line_counter - prev_offset->ival < block_size
- && llp != NULL)
- {
- line_counter += llp->num_appearances;
- llp = llp->next;
- }
- } else {
- forecast_pos = s_ForecastBlockPattern (llp, next_offset,
- line_counter,
- block_size);
- if (forecast_pos > 0) {
- new_offset = s_IntLinkNew (forecast_pos, NULL);
- if (new_offset == NULL) {
- return NULL;
- }
- if (prev_offset == NULL) {
- new_offset->next = offset_list;
- offset_list = new_offset;
- } else {
- new_offset->next = next_offset;
- prev_offset->next = new_offset;
- }
- prev_offset = new_offset;
- /* skip past the lines for this block */
- while (line_counter - prev_offset->ival < block_size
- && llp != NULL)
- {
- line_counter += llp->num_appearances;
- llp = llp->next;
- }
- } else {
- line_counter += llp->num_appearances;
- llp = llp->next;
- }
- }
- }
- return offset_list;
- }
- /* This function looks for lines that could not be assigned to an interleaved
- * block. It returns eTrue if it finds any such lines after the first offset,
- * eFalse otherwise, and reports all instances of unused lines as errors.
- */
- static EBool
- s_FindUnusedLines
- (SLengthListPtr pattern_list,
- SAlignRawFilePtr afrp)
- {
- TIntLinkPtr offset;
- SLengthListPtr llp;
- int line_counter;
- int block_line_counter;
- EBool rval = eFalse;
- TLineInfoPtr line_val;
- int skip;
- if (pattern_list == NULL || afrp == NULL
- || afrp->offset_list == NULL || afrp->block_size < 2) {
- return eFalse;
- }
-
- offset = afrp->offset_list;
- llp = pattern_list;
- line_counter = 0;
- line_val = afrp->line_list;
-
- while (llp != NULL && line_val != NULL) {
- while (llp != NULL && line_val != NULL
- && (offset == NULL || line_counter < offset->ival)) {
- if (llp->lengthrepeats != NULL) {
- s_ReportUnusedLine (line_counter,
- line_counter + llp->num_appearances - 1,
- line_val,
- afrp->report_error,
- afrp->report_error_userdata);
- if (offset != afrp->offset_list) {
- rval = eTrue;
- }
- }
- line_counter += llp->num_appearances;
- for (skip = 0;
- skip < llp->num_appearances && line_val != NULL;
- skip++) {
- line_val = line_val->next;
- }
- llp = llp->next;
- }
- block_line_counter = 0;
- while (block_line_counter < afrp->block_size && llp != NULL) {
- block_line_counter += llp->num_appearances;
- line_counter += llp->num_appearances;
- for (skip = 0;
- skip < llp->num_appearances && line_val != NULL;
- skip++) {
- line_val = line_val->next;
- }
- llp = llp->next;
- }
- if (offset != NULL) {
- offset = offset->next;
- }
- }
- return rval;
- }
- /* This function examines a list of line lengths, looking for interleaved
- * blocks. If it finds them, it will set the SAlignRawFileData offset_list
- * member variable to point to a list of locations for the blocks.
- */
- static void
- s_FindInterleavedBlocks
- (SLengthListPtr pattern_list,
- SAlignRawFilePtr afrp)
- {
- SLengthListPtr llp, llp_next;
- TSizeInfoPtr size_list, best_ptr;
- TIntLinkPtr new_offset;
- int line_counter;
- afrp->block_size = 0;
- size_list = NULL;
- afrp->offset_list = NULL;
- for (llp = pattern_list; llp != NULL; llp = llp->next) {
- llp_next = llp->next;
- if (llp->num_appearances > 1
- && (llp_next == NULL || llp_next->lengthrepeats == NULL)) {
- size_list = s_AddSizeInfo (size_list, llp->num_appearances);
- }
- }
- best_ptr = s_GetMostPopularSizeInfo (size_list);
- if (best_ptr != NULL && best_ptr->num_appearances > 1) {
- afrp->block_size = best_ptr->size_value;
- line_counter = 0;
- for (llp = pattern_list; llp != NULL; llp = llp->next) {
- llp_next = llp->next;
- if (llp->num_appearances == afrp->block_size
- && (llp_next == NULL || llp_next->lengthrepeats == NULL))
- {
- new_offset = s_IntLinkNew (line_counter, afrp->offset_list);
- if (new_offset == NULL) {
- return;
- }
- if (afrp->offset_list == NULL) afrp->offset_list = new_offset;
- }
- line_counter += llp->num_appearances;
- }
- afrp->offset_list = s_AugmentBlockPatternOffsetList (pattern_list,
- afrp->offset_list,
- afrp->block_size);
- }
- if (s_FindUnusedLines (pattern_list, afrp)) {
- s_IntLinkFree (afrp->offset_list);
- afrp->offset_list = NULL;
- afrp->block_size = 0;
- }
- s_SizeInfoFree (size_list);
-
- }
/* Removes trailing whitespace (space, tab, carriage return, newline) from
 * linestring in place by overwriting it with NUL characters.
 *
 * The first character of the string is never examined or removed, so a
 * string consisting only of whitespace is shortened to its first
 * character rather than to the empty string.  NULL and empty strings are
 * left untouched.
 */
static void s_TrimEndSpace (char *linestring)
{
    size_t len;
    char * cp;

    if (linestring == NULL) return;
    len = strlen (linestring);
    if (len == 0) return;       /* avoid forming a pointer before the buffer */

    cp = linestring + len - 1;
    while (cp > linestring
           && (*cp == ' ' || *cp == '\t' || *cp == '\r' || *cp == '\n'))
    {
        *cp = 0;
        cp--;
    }
}
- static SAlignRawFilePtr
- s_ReadAlignFileRaw
- (FReadLineFunction readfunc,
- void * userdata,
- TSequenceInfoPtr sequence_info,
- FReportErrorFunction errfunc,
- void * errdata)
- {
- char * linestring;
- SAlignRawFilePtr afrp;
- char * tmp;
- EBool found_stop;
- int overall_line_count;
- EBool found_expected_ntax = eFalse;
- EBool found_expected_nchar = eFalse;
- EBool found_char_comment = eFalse;
- SLengthListPtr pattern_list = NULL;
- SLengthListPtr this_pattern;
- char * cp;
- int len;
- TIntLinkPtr new_offset;
- EBool in_taxa_comment;
- EBool in_bracketed_comment = eFalse;
- TBracketedCommentListPtr comment_list = NULL, last_comment = NULL;
-
- if (readfunc == NULL || sequence_info == NULL) {
- return NULL;
- }
- afrp = s_AlignFileRawNew ();
- if (afrp == NULL) {
- return NULL;
- }
-
- afrp->alphabet = strdup (sequence_info->alphabet);
- afrp->report_error = errfunc;
- afrp->report_error_userdata = errdata;
- overall_line_count = 0;
- found_stop = eFalse;
- in_taxa_comment = eFalse;
- linestring = readfunc (userdata);
- if (s_IsASN1 (linestring)) {
- s_ReportASN1Error (afrp->report_error, afrp->report_error_userdata);
- s_AlignFileRawFree (afrp);
- return NULL;
- }
- while (linestring != NULL && linestring [0] != EOF) {
- s_TrimEndSpace (linestring);
- s_ReadOrgNamesFromText (linestring, overall_line_count, afrp);
- /* we want to remove the comment from the line for the purpose
- * of looking for blank lines and skipping,
- * but save comments for storing in array if line is not skippable or
- * blank
- */
- len = strspn (linestring, " trn");
- tmp = strdup (linestring + len);
- if (tmp == NULL) {
- return NULL;
- }
-
- if (! found_stop && ! in_taxa_comment) {
- found_stop = s_FoundStopLine (tmp);
- }
- if (! found_stop) {
- if (! found_expected_ntax || ! found_expected_nchar) {
- if (s_IsTwoNumbersSeparatedBySpace (tmp)) {
- s_GetFASTAExpectedNumbers (tmp, afrp);
- found_expected_ntax = eTrue;
- found_expected_nchar = eTrue;
- } else {
- s_GetNexusSizeComments (tmp, &found_expected_ntax,
- &found_expected_nchar, afrp);
- }
- }
- if (! found_char_comment) {
- found_char_comment = s_CheckNexusCharInfo (tmp, sequence_info,
- afrp->report_error,
- afrp->report_error_userdata);
- }
-
- if (in_taxa_comment) {
- if (strncmp (tmp, "end;", 4) == 0) {
- in_taxa_comment = eFalse;
- }
- tmp [0] = 0;
- } else if (strncmp (tmp, "begin taxa;", 11) == 0) {
- tmp [0] = 0;
- in_taxa_comment = eTrue;
- }
- /* remove complete single-line bracketed comments from line
- *before checking for multiline bracketed comments */
- s_RemoveCommentFromLine (tmp);
- if (in_bracketed_comment) {
- len = strspn (linestring, " trn");
- if (last_comment != NULL)
- {
- s_BracketedCommentListAddLine (last_comment, linestring + len,
- overall_line_count, len);
- }
- if (strchr (tmp, ']') != NULL) {
- in_bracketed_comment = eFalse;
- }
- tmp [0] = 0;
- } else if (tmp [0] == '[' && strchr (tmp, ']') == NULL) {
- in_bracketed_comment = eTrue;
- len = strspn (linestring, " trn");
- last_comment = s_BracketedCommentListNew (comment_list,
- linestring + len,
- overall_line_count, len);
- if (comment_list == NULL)
- {
- comment_list = last_comment;
- }
- tmp [0] = 0;
- }
- if (s_SkippableString (tmp)) {
- tmp [0] = 0;
- }
-
- if (tmp [0] == '>' && ! found_stop) {
- afrp->marked_ids = eTrue;
- new_offset = s_IntLinkNew (overall_line_count + 1,
- afrp->offset_list);
- if (afrp->offset_list == NULL) afrp->offset_list = new_offset;
- }
- if (! afrp->marked_ids) {
- /* add to length list for interleaved block search */
- len = strcspn (tmp, " tr");
- if (len > 0) {
- cp = tmp + len;
- len = strspn (cp, " tr");
- if (len > 0) {
- cp = cp + len;
- }
- if (*cp == 0) {
- this_pattern = s_GetBlockPattern (tmp);
- } else {
- this_pattern = s_GetBlockPattern (cp);
- }
- pattern_list = s_AddPatternRepeat (pattern_list,
- this_pattern);
- } else {
- this_pattern = s_GetBlockPattern (tmp);
- pattern_list = s_AddPatternRepeat (pattern_list,
- this_pattern);
- }
- }
- len = strspn (linestring, " trn");
- afrp->line_list = s_AddLineInfo (afrp->line_list,
- linestring + len,
- overall_line_count, len);
- }
- free (linestring);
- free (tmp);
- linestring = readfunc (userdata);
- overall_line_count ++;
- }
- afrp->num_segments = s_GetNumSegmentsInAlignment (comment_list, errfunc, errdata);
- if (afrp->num_segments > 1)
- {
- if (afrp->offset_list != NULL)
- {
- s_ReportSegmentedAlignmentError (afrp->offset_list,
- errfunc, errdata);
- s_AlignFileRawFree (afrp);
- s_LengthListFree (pattern_list);
- s_BracketedCommentListFree (comment_list);
- return NULL;
- }
- else
- {
- afrp->offset_list = GetSegmentOffsetList (comment_list);
- afrp->marked_ids = eTrue;
- }
- }
- if (! afrp->marked_ids) {
- s_FindInterleavedBlocks (pattern_list, afrp);
- }
- s_LengthListFree (pattern_list);
- s_BracketedCommentListFree (comment_list);
- return afrp;
- }
- /* This function analyzes a block to see if it contains, as the first element
- * of any of its lines, one of the sequence IDs already identified. If the
- * one of the lines does begin with a sequence ID, all of the lines are
- * assumed to begin with sequence IDs and the function returns eTrue, otherwise
- * the function returns eFalse.
- */
- static EBool
- s_DoesBlockHaveIds
- (SAlignRawFilePtr afrp,
- TLineInfoPtr first_line,
- int num_lines_in_block)
- {
- TLineInfoPtr lip;
- char * linestring;
- char * this_id;
- TAlignRawSeqPtr arsp;
- size_t len;
- int block_offset;
- if (afrp->sequences == NULL) {
- return eTrue;
- }
- for (lip = first_line, block_offset = 0;
- lip != NULL && block_offset < num_lines_in_block;
- lip = lip->next, block_offset++)
- {
- linestring = lip->data;
- if (linestring != NULL) {
- len = strcspn (linestring, " tr");
- if (len > 0 && len < strlen (linestring)) {
- this_id = (char *) malloc (len + 1);
- if (this_id == NULL) {
- return eFalse;
- }
- strncpy (this_id, linestring, len);
- this_id [len] = 0;
- arsp = s_FindAlignRawSeqById (afrp->sequences, this_id);
- free (this_id);
- if (arsp != NULL) {
- return eTrue;
- }
- }
- }
- }
- return eFalse;
- }
- /* This function analyzes the lines of the block to see if the pattern of
- * the lengths of the whitespace-separated pieces of sequence data matches
- * for all lines within the block. The function returns eTrue if this is so,
- * otherwise the function returns eFalse.
- */
- static EBool
- s_BlockIsConsistent
- (SAlignRawFilePtr afrp,
- TLineInfoPtr first_line,
- int num_lines_in_block,
- EBool has_ids,
- EBool first_block)
- {
- TLineInfoPtr lip;
- SLengthListPtr list, this_pattern, best;
- int len, block_offset, id_offset;
- char * tmp_id;
- EBool rval;
- char * cp;
- rval = eTrue;
- list = NULL;
- for (lip = first_line, block_offset = 0;
- lip != NULL && block_offset < num_lines_in_block;
- lip = lip->next, block_offset ++)
- {
- cp = lip->data;
- if (has_ids) {
- len = strcspn (cp, " tr");
- tmp_id = (char *) malloc ( (len + 1) * sizeof (char));
- if (tmp_id == NULL) {
- return eFalse;
- }
- strncpy (tmp_id, cp, len);
- tmp_id [len] = 0;
- id_offset = s_FindAlignRawSeqOffsetById (afrp->sequences, tmp_id);
- if (id_offset != block_offset && ! first_block) {
- rval = eFalse;
- s_ReportInconsistentID (tmp_id, lip->line_num,
- afrp->report_error,
- afrp->report_error_userdata);
- }
- free (tmp_id);
- cp += len;
- cp += strspn (cp, " tr");
- }
- this_pattern = s_GetBlockPattern (cp);
- list = s_AddLengthList (list, this_pattern);
- }
- /* Now find the pattern with the most appearances */
- best = NULL;
- for (this_pattern = list;
- this_pattern != NULL;
- this_pattern = this_pattern->next)
- {
- if (this_pattern->num_appearances == 0) continue;
- if (best == NULL
- || this_pattern->num_appearances > best->num_appearances)
- {
- best = this_pattern;
- }
- }
- /* now identify and report inconsistent lines */
- for (lip = first_line, block_offset = 0;
- lip != NULL && block_offset < num_lines_in_block;
- lip = lip->next, block_offset ++)
- {
- cp = lip->data;
- if (has_ids) {
- len = strcspn (cp, " tr");
- tmp_id = (char *) malloc ( (len + 1) * sizeof (char));
- if (tmp_id == NULL) {
- return eFalse;
- }
- strncpy (tmp_id, cp, len);
- tmp_id [len] = 0;
- cp += len;
- cp += strspn (cp, " tr");
- } else {
- tmp_id = s_GetAlignRawSeqIDByOffset (afrp->sequences, block_offset);
- }
- this_pattern = s_GetBlockPattern (cp);
- if ( ! s_DoLengthPatternsMatch (this_pattern, best)) {
- rval = eFalse;
- s_ReportInconsistentBlockLine (tmp_id, lip->line_num,
- afrp->report_error,
- afrp->report_error_userdata);
- }
- s_LengthListFree (this_pattern);
- if (has_ids) {
- free (tmp_id);
- }
- }
- s_LengthListFree (list);
- return rval;
- }
- /* This function processes a block of lines and adds the sequence data from
- * each line in the block to the appropriate sequence in the list.
- */
- static void
- s_ProcessBlockLines
- (SAlignRawFilePtr afrp,
- TLineInfoPtr lines,
- int num_lines_in_block,
- EBool first_block)
- {
- TLineInfoPtr lip;
- char * linestring;
- char * cp;
- char * this_id;
- int len;
- int line_number;
- EBool this_block_has_ids;
- int pos;
- this_block_has_ids = s_DoesBlockHaveIds (afrp, lines, num_lines_in_block);
- s_BlockIsConsistent (afrp, lines, num_lines_in_block, this_block_has_ids,
- first_block);
- for (lip = lines, line_number = 0;
- lip != NULL && line_number < num_lines_in_block;
- lip = lip->next, line_number ++)
- {
- linestring = lip->data;
- if (linestring != NULL) {
- pos = 0;
- if (this_block_has_ids) {
- len = strcspn (linestring, " tr");
- this_id = (char *) malloc (len + 1);
- if (this_id == NULL) {
- return;
- }
- strncpy (this_id, linestring, len);
- this_id [len] = 0;
- cp = linestring + len;
- pos += len;
- len = strspn (linestring, " tr");
- cp += len;
- pos += len;
- afrp->sequences = s_AddAlignRawSeqById (afrp->sequences,
- this_id, cp,
- lip->line_num,
- lip->line_num,
- lip->line_offset + cp - linestring);
- free (this_id);
- } else {
- if (! s_AddAlignRawSeqByIndex (afrp->sequences, line_number,
- linestring,
- lip->line_num, lip->line_offset))
- {
- s_ReportBlockLengthError ("", lip->line_num,
- afrp->block_size,
- line_number,
- afrp->report_error,
- afrp->report_error_userdata);
- }
- }
- }
- }
- }
- /* This function removes comments from the lines of an interleaved block of
- * data.
- */
- static void
- s_RemoveCommentsFromBlock
- (TLineInfoPtr first_line,
- int num_lines_in_block)
- {
- TLineInfoPtr lip;
- int block_offset;
- for (lip = first_line, block_offset = 0;
- lip != NULL && block_offset < num_lines_in_block;
- lip = lip->next)
- {
- s_RemoveCommentFromLine (lip->data);
- }
- }
- /* This function processes the interleaved block of data found at each
- * location listed in afrp->offset_list.
- */
- static void s_ProcessAlignRawFileByBlockOffsets (SAlignRawFilePtr afrp)
- {
- int line_counter;
- TIntLinkPtr offset_ptr;
- TLineInfoPtr lip;
- EBool first_block = eTrue;
- EBool in_taxa_comment = eFalse;
-
- if (afrp == NULL) {
- return;
- }
-
- line_counter = 0;
- offset_ptr = afrp->offset_list;
- lip = afrp->line_list;
- while (lip != NULL && offset_ptr != NULL
- && (in_taxa_comment || ! s_FoundStopLine (lip->data))) {
- if (in_taxa_comment) {
- if (strncmp (lip->data, "end;", 4) == 0) {
- in_taxa_comment = eFalse;
- }
- } else if (lip->data != NULL
- && strncmp (lip->data, "begin taxa;", 11) == 0) {
- in_taxa_comment = eTrue;
- }
- if (line_counter == offset_ptr->ival) {
- s_RemoveCommentsFromBlock (lip, afrp->block_size);
- s_ProcessBlockLines (afrp, lip, afrp->block_size, first_block);
- first_block = eFalse;
- offset_ptr = offset_ptr->next;
- }
- lip = lip->next;
- line_counter ++;
- }
- }
- /* The following functions are used to analyze contiguous data. */
- static void
- s_CreateSequencesBasedOnTokenPatterns
- (TLineInfoPtr token_list,
- TIntLinkPtr offset_list,
- SLengthListPtr * anchorpattern,
- SAlignRawFilePtr afrp)
- {
- TLineInfoPtr lip;
- int line_counter;
- TIntLinkPtr offset_ptr, next_offset_ptr;
- char * curr_id;
- TSizeInfoPtr sip;
- int pattern_line_counter;
- int curr_seg;
- if (token_list == NULL || offset_list == NULL
- || anchorpattern == NULL
- || afrp == NULL)
- {
- return;
- }
- for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
- {
- if (anchorpattern [curr_seg] == NULL || anchorpattern [curr_seg]->lengthrepeats == NULL)
- {
- return;
- }
- }
- line_counter = 0;
- lip = token_list;
- offset_ptr = offset_list;
- curr_seg = 0;
-
- for (offset_ptr = offset_list;
- offset_ptr != NULL && lip != NULL;
- offset_ptr = offset_ptr->next)
- {
- next_offset_ptr = offset_ptr->next;
- while (line_counter < offset_ptr->ival - 1 && lip != NULL) {
- lip = lip->next;
- line_counter ++;
- }
- if (lip != NULL) {
- curr_id = lip->data;
- lip = lip->next;
- line_counter ++;
- for (sip = anchorpattern[curr_seg]->lengthrepeats;
- sip != NULL
- && lip != NULL
- && (next_offset_ptr == NULL
- || line_counter < next_offset_ptr->ival - 1);
- sip = sip->next)
- {
- for (pattern_line_counter = 0;
- pattern_line_counter < sip->num_appearances
- && lip != NULL
- && (next_offset_ptr == NULL
- || line_counter < next_offset_ptr->ival - 1);
- pattern_line_counter ++)
- {
- if ((int) strlen (lip->data) != sip->size_value) {
- s_ReportLineLengthError (curr_id, lip, sip->size_value,
- afrp->report_error,
- afrp->report_error_userdata);
- }
- afrp->sequences = s_AddAlignRawSeqById (afrp->sequences,
- curr_id,
- lip->data,
- lip->line_num,
- lip->line_num,
- lip->line_offset);
- lip = lip->next;
- line_counter ++;
- }
- }
- if (sip != NULL && lip != NULL) {
- s_ReportBlockLengthError (curr_id, lip->line_num,
- afrp->block_size,
- line_counter - offset_ptr->ival,
- afrp->report_error,
- afrp->report_error_userdata);
- }
- }
- curr_seg ++;
- if (curr_seg >= afrp->num_segments)
- {
- curr_seg = 0;
- }
- }
- }
- /* The following functions are used for analyzing contiguous data with
- * marked IDs.
- */
- /* This function creates a new LengthList pattern for each marked ID.
- * After each new list is created, the function checks to see if the
- * new pattern matches any pattern already in the list of patterns seen.
- * If so, the function deletes the new pattern and increments
- * num_appearances for the pattern in the list, otherwise the function
- * adds the new pattern to the list.
- * When the list is complete, the function finds the pattern with the
- * most appearances and returns that pattern as the anchor pattern to use
- * when checking sequence data blocks for consistency with one another.
- */
- static SLengthListPtr *
- s_CreateAnchorPatternForMarkedIDs
- (SAlignRawFilePtr afrp)
- {
- SLengthListPtr * list;
- SLengthListPtr * best;
- SLengthListPtr this_pattern;
- char * cp;
- TLineInfoPtr lip;
- int curr_seg;
- if (afrp == NULL) {
- return NULL;
- }
- /* initialize length lists */
- list = (SLengthListPtr *) malloc (afrp->num_segments * sizeof (SLengthListPtr));
- if (list == NULL)
- {
- return NULL;
- }
- for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
- {
- list[curr_seg] = NULL;
- }
- /* initialize best ptrs */
- /* list is one element longer, to hold null terminator */
- best = (SLengthListPtr *) malloc ((afrp->num_segments + 1) * sizeof (SLengthListPtr));
- if (best == NULL)
- {
- return NULL;
- }
- for (curr_seg = 0; curr_seg < afrp->num_segments + 1; curr_seg ++)
- {
- best[curr_seg] = NULL;
- }
-
- /* initialize pattern */
- this_pattern = NULL;
- curr_seg = 0;
- for (lip = afrp->line_list;
- lip != NULL && ! s_FoundStopLine (lip->data);
- lip = lip->next)
- {
- if (lip->data == NULL) continue;
- if (lip->data [0] == ']' || lip->data [0] == '[') continue;
- if (lip->data [0] == '>') {
- if (this_pattern != NULL) {
- list [curr_seg] = s_AddLengthList (list [curr_seg], this_pattern);
- curr_seg ++;
- if (curr_seg >= afrp->num_segments)
- {
- curr_seg = 0;
- }
- }
- this_pattern = s_LengthListNew (NULL);
- if (this_pattern == NULL) {
- for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
- {
- s_LengthListFree (list [curr_seg]);
- }
- free (list);
- return NULL;
- }
- this_pattern->num_appearances = 1;
- } else if (this_pattern != NULL) {
- /* This section gets rid of base pair number comments */
- cp = lip->data;
- while ( isspace ((int )*cp) || isdigit ((int )*cp)) {
- cp++;
- }
- s_AddLengthRepeat (this_pattern, strlen (cp));
- }
- }
- if (this_pattern != NULL) {
- list[curr_seg] = s_AddLengthList (list [curr_seg], this_pattern);
- }
- /* Now find the pattern with the most appearances for each segment*/
- for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg++)
- {
- for (this_pattern = list [curr_seg];
- this_pattern != NULL;
- this_pattern = this_pattern->next)
- {
- if (this_pattern->num_appearances == 0) continue;
- if (best [curr_seg] == NULL
- || this_pattern->num_appearances > best[curr_seg]->num_appearances)
- {
- best[curr_seg] = this_pattern;
- }
-
- }
- /* free all patterns before and after anchor pattern */
- if (best [curr_seg] != NULL) {
- s_LengthListFree (best [curr_seg]->next);
- best [curr_seg]->next = NULL;
- }
- if (best [curr_seg] != list [curr_seg]) {
- this_pattern = list [curr_seg];
- while ( this_pattern != NULL && this_pattern->next != best[curr_seg] ) {
- this_pattern = this_pattern->next;
- }
- if (this_pattern != NULL) {
- this_pattern->next = NULL;
- s_LengthListFree (list [curr_seg]);
- }
- }
- }
- for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
- {
- if (best[curr_seg] == NULL)
- {
- for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
- {
- s_LengthListFree (best [curr_seg]);
- }
- return NULL;
- }
- }
-
- return best;
- }
- /* This function removes base pair count comments from the data sections
- * for contiguous marked ID sequences.
- */
- static void s_RemoveBasePairCountCommentsFromData (SAlignRawFilePtr afrp)
- {
- TIntLinkPtr this_offset, next_offset;
- TLineInfoPtr lip;
- int line_count;
- char * cp;
- if (afrp == NULL || afrp->offset_list == NULL) {
- return;
- }
- this_offset = afrp->offset_list;
- next_offset = this_offset->next;
- lip = afrp->line_list;
- line_count = 0;
- while (lip != NULL && this_offset != NULL) {
- if (line_count == this_offset->ival) {
- while (lip != NULL &&
- (next_offset == NULL
- || line_count < next_offset->ival - 1)) {
- cp = lip->data;
- if (cp != NULL) {
- cp += strspn (cp, " trn1234567890");
- if (cp != lip->data) {
- strcpy (lip->data, cp);
- }
- }
- line_count ++;
- lip = lip->next;
- }
- this_offset = this_offset->next;
- if (this_offset != NULL) {
- next_offset = this_offset->next;
- }
- } else {
- line_count ++;
- lip = lip->next;
- }
- }
- }
-
- /* This function assumes that the offset_list has already been populated
- * with the locations of the data blocks. It analyzes the blocks of data
- * to find the most frequently occurring pattern of lengths of data and then
- * uses that pattern to attach the data to the correct IDs and report any
- * errors in formatting.
- */
- static void s_ProcessAlignFileRawForMarkedIDs (SAlignRawFilePtr afrp)
- {
- SLengthListPtr * anchorpattern;
- if (afrp == NULL) {
- return;
- }
- s_RemoveBasePairCountCommentsFromData (afrp);
- anchorpattern = s_CreateAnchorPatternForMarkedIDs (afrp);
- if (anchorpattern == NULL || afrp->offset_list == NULL) {
- return;
- }
- s_CreateSequencesBasedOnTokenPatterns (afrp->line_list, afrp->offset_list,
- anchorpattern, afrp);
- }
- /* The following functions are used for analyzing contiguous sequence data
- * without marked IDs.
- */
/* This function left-shifts a string: the NUL-terminated text starting at
 * cp_to is moved, terminator included, so that it starts at cp_from.
 * Both pointers are expected to point into the same buffer, with cp_from
 * before cp_to.  Nothing is done if either pointer is NULL or if they are
 * equal.  memmove is used because source and destination overlap.
 */
static void
s_StringLeftShift
(char * cp_from,
 char * cp_to)
{
    if (cp_from == cp_to || cp_from == NULL || cp_to == NULL) {
        return;
    }
    memmove (cp_from, cp_to, strlen (cp_to) + 1);
}
- /* This function removes bracketed comments from a linked list of
- * SLineInfo structures. The function returns a pointer to the
- * list without the comments.
- */
- static TLineInfoPtr s_RemoveCommentsFromTokens (TLineInfoPtr list)
- {
- TLineInfoPtr lip;
- int num_comment_starts;
- char * cp_l;
- char * cp_r;
- char * cp;
- EBool in_comment;
- num_comment_starts = 0;
- in_comment = eFalse;
- for (lip = list; lip != NULL; lip = lip->next) {
- if (lip->data == NULL) {
- lip->delete_me = eTrue;
- } else {
- cp_l = NULL;
- cp_r = NULL;
- for (cp = lip->data; *cp != 0; cp++) {
- if (*cp == ']') {
- if (cp_r == NULL) {
- s_StringLeftShift (lip->data, cp + 1);
- cp = lip->data - 1;
- } else {
- s_StringLeftShift (cp_r, cp + 1);
- cp = cp_r;
- if (cp_r > lip->data) {
- cp_r --;
- while (cp_r >= lip->data && *cp_r != '[') {
- cp_r --;
- }
- if (cp_r < lip->data) {
- cp_r = NULL;
- }
- } else {
- cp_r = NULL;
- }
- }
- if (num_comment_starts > 0) {
- num_comment_starts --;
- }
- } else if (*cp == '[') {
- cp_r = cp;
- num_comment_starts ++;
- }
- }
- if (in_comment) {
- if (num_comment_starts == 0) {
- in_comment = eFalse;
- } else {
- lip->delete_me = eTrue;
- }
- } else if (num_comment_starts > 0) {
- cp_r = strchr (lip->data, '[');
- if (cp_r != NULL) {
- *cp_r = 0;
- }
- in_comment = eTrue;
- }
- if (lip->data [0] == 0) {
- lip->delete_me = eTrue;
- }
- }
- }
- list = s_DeleteLineInfos (list);
- return list;
- }
- /* This function removes Nexus comments from a linked list of SLineInfo
- * structures. The function returns a pointer to the list without the
- * comments.
- */
- static TLineInfoPtr s_RemoveNexusCommentsFromTokens (TLineInfoPtr list)
- {
- TLineInfoPtr lip, start_lip, end_lip;
- lip = list;
- start_lip = NULL;
- end_lip = NULL;
- while (lip != NULL) {
- if (s_StringICmp (lip->data, "#NEXUS") == 0) {
- start_lip = lip;
- end_lip = lip;
- while (end_lip != NULL
- && s_StringICmp (end_lip->data, "matrix") != 0) {
- end_lip = end_lip->next;
- }
- if (end_lip != NULL) {
- while (start_lip != end_lip) {
- start_lip->delete_me = eTrue;
- start_lip = start_lip->next;
- }
- end_lip->delete_me = eTrue;
- lip = end_lip->next;
- } else {
- lip = lip->next;
- }
- } else {
- lip = lip->next;
- }
- }
- list = s_DeleteLineInfos (list);
- return list;
- }
- /* This function finds the number of characters that occur most frequently
- * in a token and returns a pointer to a SSizeInfo structure that
- * describes the most common length and the number of times it appears.
- */
- static TSizeInfoPtr
- s_FindMostFrequentlyOccurringTokenLength
- (TSizeInfoPtr list,
- int not_this_size)
- {
- TSizeInfoPtr list_ptr, new_list, best_ptr, return_best;
- new_list = NULL;
- for (list_ptr = list; list_ptr != NULL; list_ptr = list_ptr->next) {
- if (not_this_size != list_ptr->size_value) {
- new_list = s_AddSizeInfoAppearances (new_list,
- list_ptr->size_value,
- list_ptr->num_appearances);
- }
- }
- best_ptr = s_GetMostPopularSizeInfo (new_list);
- return_best = NULL;
- if (best_ptr != NULL) {
- return_best = s_SizeInfoNew (NULL);
- if (return_best != NULL) {
- return_best->size_value = best_ptr->size_value;
- return_best->num_appearances = best_ptr->num_appearances;
- }
- }
- s_SizeInfoFree (new_list);
- return return_best;
- }
- /* This function examines all instances of an anchor pattern in the data
- * and checks to see if the line immediately after the anchor pattern should
- * be used as part of the anchor pattern. This function exists because
- * frequently, but not always, contiguous data will consist of multiple lines
- * of data of the same length (for example, 80 characters), followed by one
- * shorter line with the remaining data. We must also make sure that we do
- * not accidentally include the ID of the next sequence in the data of the
- * previous sequence.
- */
- static void
- s_ExtendAnchorPattern
- (SLengthListPtr anchorpattern,
- TSizeInfoPtr line_lengths)
- {
- TSizeInfoPtr last_line_lengths, sip, sip_next, twoafter;
- int best_last_line_length;
- int anchor_line_length;
- if (anchorpattern == NULL
- || anchorpattern->lengthrepeats == NULL
- || line_lengths == NULL) {
- return;
- }
- last_line_lengths = NULL;
- anchor_line_length = anchorpattern->lengthrepeats->size_value;
- /* also check to make sure that there's more than one line between
- * this pattern and the next pattern, otherwise the next line is the
- * ID for the next pattern and shouldn't be included in the anchor
- */
- for (sip = line_lengths; sip != NULL; sip = sip->next) {
- if (s_SizeInfoIsEqual (sip, anchorpattern->lengthrepeats)) {
- sip_next = sip->next;
- if (sip_next != NULL
- && sip_next->size_value > 0
- && sip_next->size_value != anchor_line_length
- && ((twoafter = sip_next->next) == NULL
- || twoafter->size_value != anchor_line_length))
- {
- last_line_lengths = s_AddSizeInfo (last_line_lengths,
- sip_next->size_value);
- }
- }
- }
- best_last_line_length = s_GetMostPopularSize (last_line_lengths);
- if (best_last_line_length > 0) {
- s_AddLengthRepeat (anchorpattern, best_last_line_length);
- }
- s_SizeInfoFree (last_line_lengths);
- }
- /* This function looks for the most frequently occurring pattern, where a
- * pattern is considered to be N contiguous tokens of M characters. The
- * function then checks to see if there is usually a token of a particular
- * length that immediately follows this pattern that is not the ID for the
- * next sequence. If so, this line length is added to the pattern.
- * The function returns a pointer to this pattern.
- */
- static SLengthListPtr s_FindMostPopularPattern (TSizeInfoPtr list)
- {
- SLengthListPtr patternlist, newpattern;
- TSizeInfoPtr sip, popular_line_length;
- SLengthListPtr index, best;
- int not_this_length;
- patternlist = NULL;
- for (sip = list; sip != NULL; sip = sip->next) {
- if (sip->size_value > 0) {
- newpattern = s_LengthListNew (NULL);
- if (newpattern == NULL) {
- s_LengthListFree (patternlist);
- return NULL;
- }
- newpattern->num_appearances = 1;
- newpattern->lengthrepeats = s_SizeInfoNew (NULL);
- if (newpattern->lengthrepeats == NULL) {
- s_LengthListFree (patternlist);
- return NULL;
- }
- newpattern->lengthrepeats->size_value = sip->size_value;
- newpattern->lengthrepeats->num_appearances = sip->num_appearances;
- patternlist = s_AddLengthList (patternlist, newpattern);
- }
- }
- if (patternlist == NULL) {
- return NULL;
- }
- best = NULL;
- for (index = patternlist; index != NULL; index = index->next) {
- if (index->lengthrepeats->num_appearances < 2) {
- continue;
- }
- if (best==NULL || best->num_appearances < index->num_appearances) {
- best = index;
- } else if (best->num_appearances == index->num_appearances
- && best->lengthrepeats->size_value <
- index->lengthrepeats->size_value) {
- best = index;
- }
- }
- /* Free data in list before best pattern */
- index = patternlist;
- while ( index != NULL && index->next != best ) {
- index = index->next;
- }
- if (index != NULL) {
- index->next = NULL;
- s_LengthListFree (patternlist);
- }
- /* Free data in list after best pattern */
- if (best != NULL) {
- s_LengthListFree (best->next);
- best->next = NULL;
- }
- popular_line_length = s_FindMostFrequentlyOccurringTokenLength (list, 0);
- if (best != NULL && best->lengthrepeats != NULL
- && popular_line_length != NULL
- && best->lengthrepeats->size_value == popular_line_length->size_value)
- {
- not_this_length = popular_line_length->size_value;
- s_SizeInfoFree (popular_line_length);
- popular_line_length = s_FindMostFrequentlyOccurringTokenLength (list,
- not_this_length);
- }
- if (best == NULL
- || (popular_line_length != NULL
- && popular_line_length->size_value > best->lengthrepeats->size_value
- && popular_line_length->num_appearances > best->num_appearances))
- {
- if (best == NULL) {
- best = s_LengthListNew (NULL);
- if (best == NULL) {
- return NULL;
- }
- }
- best->lengthrepeats = s_SizeInfoNew (NULL);
- if (best->lengthrepeats == NULL) {
- return NULL;
- }
- best->lengthrepeats->size_value = popular_line_length->size_value;
- best->lengthrepeats->num_appearances = 1;
- } else {
- /* extend anchor pattern to include best length of last line */
- s_ExtendAnchorPattern (best, list);
- }
- s_SizeInfoFree (popular_line_length);
- return best;
- }
- /* This function creates an SIntLink list to describe the locations
- * of occurrences of the anchorpattern in the SSizeInfo list.
- * The function returns a pointer to the SIntLink list.
- */
- static TIntLinkPtr
- s_CreateOffsetList
- (TSizeInfoPtr list,
- SLengthListPtr anchorpattern)
- {
- int line_counter;
- TIntLinkPtr offset_list, new_offset;
- TSizeInfoPtr sip, prev_sip;
- if (list == NULL || anchorpattern == NULL) {
- return NULL;
- }
- line_counter = 0;
- offset_list = NULL;
- prev_sip = NULL;
- for (sip = list; sip != NULL; sip = sip->next) {
- if (s_SizeInfoIsEqual (sip, anchorpattern->lengthrepeats)) {
- new_offset = s_IntLinkNew (line_counter, offset_list);
- if (new_offset == NULL) {
- s_IntLinkFree (offset_list);
- return NULL;
- }
- if (offset_list == NULL) {
- offset_list = new_offset;
- }
- }
-
- line_counter += sip->num_appearances;
- prev_sip = sip;
- }
- return offset_list;
- }
- /* This function determines whether or not the number of expected sequence
- * characters are available starting at a token after line_start and stopping
- * at least one token before the next known sequence data block in the list.
- * If so, the function returns the number of the token at which the sequence
- * data begins. Otherwise the function returns -1.
- */
- static int
- s_ForecastPattern
- (int line_start,
- int pattern_length,
- TIntLinkPtr next_offset,
- int sip_offset,
- TSizeInfoPtr list)
- {
- int offset, end_offset;
- TSizeInfoPtr sip;
- int line_counter, num_chars;
-
- if (list == NULL) {
- return -1;
- }
- for (offset = sip_offset; offset < list->num_appearances; offset++) {
- line_counter = line_start + offset;
- num_chars = list->size_value * (list->num_appearances - offset);
- sip = list;
- while (num_chars < pattern_length
- && (next_offset == NULL || line_counter < next_offset->ival)
- && sip->next != NULL)
- {
- sip = sip->next;
- for (end_offset = 0;
- end_offset < sip->num_appearances
- && num_chars < pattern_length
- && (next_offset == NULL
- || line_counter < next_offset->ival);
- end_offset ++)
- {
- num_chars += sip->size_value;
- line_counter ++;
- }
- }
- if (num_chars == pattern_length) {
- return line_start + offset;
- }
- }
- return -1;
- }
- /* This function examines the offset list and searches for holes where blocks
- * of sequence data without the exact expected formatting might exist. The
- * function adds the offsets of any new blocks to the list and returns a
- * pointer to the augmented offset list.
- */
- static TIntLinkPtr
- s_AugmentOffsetList
- (TIntLinkPtr offset_list,
- TSizeInfoPtr list,
- SLengthListPtr anchorpattern)
- {
- int pattern_length;
- TSizeInfoPtr sip;
- TIntLinkPtr prev_offset, next_offset, new_offset;
- int line_counter, forecast_position, line_skip;
- EBool skipped_previous = eFalse;
- int num_chars;
- if (list == NULL || anchorpattern == NULL) {
- return offset_list;
- }
- pattern_length = 0;
- for (sip = anchorpattern->lengthrepeats; sip != NULL; sip = sip->next) {
- pattern_length += (sip->size_value * sip->num_appearances);
- }
- if (pattern_length == 0) {
- return offset_list;
- }
- prev_offset = NULL;
- next_offset = offset_list;
- line_counter = 0;
- sip = list;
- while (sip != NULL) {
- /* if we are somehow out of synch, don't get caught in infinite loop */
- if (next_offset != NULL && line_counter > next_offset->ival) {
- next_offset = next_offset->next;
- } else if (next_offset != NULL && line_counter == next_offset->ival) {
- skipped_previous = eFalse;
- prev_offset = next_offset;
- next_offset = next_offset->next;
- /* advance sip and line counter past the end of this pattern */
- num_chars = 0;
- while (num_chars < pattern_length && sip != NULL) {
- num_chars += sip->size_value * sip->num_appearances;
- line_counter += sip->num_appearances;
- sip = sip->next;
- }
- } else if (skipped_previous) {
- line_skip = 0;
- while (sip != NULL && line_skip < sip->num_appearances
- && (next_offset == NULL
- || line_counter < next_offset->ival)) {
- /* see if we can build a pattern that matches the pattern
- * length we want
- */
- forecast_position = s_ForecastPattern (line_counter,
- pattern_length,
- next_offset, line_skip,
- sip);
- if (forecast_position > 0) {
- new_offset = s_IntLinkNew (forecast_position, NULL);
- if (new_offset == NULL) {
- return NULL;
- }
- if (prev_offset == NULL) {
- new_offset->next = offset_list;
- offset_list = new_offset;
- } else {
- new_offset->next = next_offset;
- prev_offset->next = new_offset;
- }
- prev_offset = new_offset;
- /* now advance sip and line counter past the end
- * of the pattern we have just created
- */
- num_chars = 0;
- while (num_chars < pattern_length && sip != NULL) {
- for (line_skip = 0;
- line_skip < sip->num_appearances
- && num_chars < pattern_length;
- line_skip++)
- {
- num_chars += sip->size_value;
- line_counter ++;
- }
- if (line_skip == sip->num_appearances) {
- sip = sip->next;
- line_skip = 0;
- }
- }
- } else {
- line_counter += sip->num_appearances;
- sip = sip->next;
- line_skip = 0;
- }
- }
- } else {
- skipped_previous = eTrue;
- line_counter += sip->num_appearances;
- sip = sip->next;
- }
- }
- return offset_list;
- }
- /* This function finds the most frequently occurring distance between
- * two sequence data blocks and returns that value.
- */
- static int s_GetMostPopularPatternLength (TIntLinkPtr offset_list)
- {
- int line_counter, best_length;
- TSizeInfoPtr pattern_length_list;
- TIntLinkPtr offset;
- if (offset_list == NULL) {
- return -1;
- }
- line_counter = -1;
- pattern_length_list = NULL;
- for (offset = offset_list; offset != NULL; offset = offset->next) {
- if (line_counter != -1) {
- pattern_length_list = s_AddSizeInfo (pattern_length_list,
- offset->ival - line_counter);
- }
- line_counter = offset->ival;
- }
- best_length = s_GetMostPopularSize (pattern_length_list);
- s_SizeInfoFree (pattern_length_list);
- return best_length;
- }
- /* This function finds the most frequently appearing number of characters
- * in a block of sequence data and returns that value.
- */
- static int
- s_GetBestCharacterLength
- (TLineInfoPtr token_list,
- TIntLinkPtr offset_list,
- int block_length)
- {
- TLineInfoPtr lip;
- TIntLinkPtr prev_offset, new_offset;
- int line_diff, num_chars, best_num_chars;
- TSizeInfoPtr pattern_length_list = NULL;
- if (token_list == NULL || offset_list == NULL || block_length < 1) {
- return -1;
- }
- /* get length of well-formatted block size */
- lip = token_list;
- prev_offset = NULL;
- for (new_offset = offset_list;
- new_offset != NULL && lip != NULL;
- new_offset = new_offset->next)
- {
- if (prev_offset == NULL) {
- /* skip first tokens */
- for (line_diff = 0;
- line_diff < new_offset->ival && lip != NULL;
- line_diff ++)
- {
- lip = lip->next;
- }
- }
- if (prev_offset != NULL) {
- num_chars = 0;
- for (line_diff = 0;
- line_diff < new_offset->ival - prev_offset->ival
- && lip != NULL;
- line_diff ++)
- {
- if (line_diff < new_offset->ival - prev_offset->ival - 1) {
- num_chars += strlen (lip->data);
- }
- lip = lip->next;
- }
- if (new_offset->ival - prev_offset->ival == block_length) {
- pattern_length_list = s_AddSizeInfo (pattern_length_list,
- num_chars);
- }
- }
- prev_offset = new_offset;
- }
- best_num_chars = s_GetMostPopularSize (pattern_length_list);
- if (best_num_chars == 0 && pattern_length_list != NULL) {
- best_num_chars = pattern_length_list->size_value;
- }
- s_SizeInfoFree (pattern_length_list);
- pattern_length_list = NULL;
- return best_num_chars;
- }
- static int
- s_CountCharactersBetweenOffsets
- (TLineInfoPtr list,
- int distance,
- int desired_num_chars)
- {
- int line_diff, num_chars, total_chars, pattern_length, num_starts;
- TLineInfoPtr lip;
- TIntLinkPtr length_list, start_list, start_ptr, length;
- int start_of_unknown;
- int num_additional_offsets_needed;
- if (list == NULL || distance == 0 || desired_num_chars == 0) {
- return 0;
- }
- /* because the first offset is the start of a known pattern, we should
- * skip to the end of that pattern and start looking for additional
- * offsets
- */
- total_chars = 0;
- for (lip = list, line_diff = 0;
- lip != NULL && line_diff < distance
- && total_chars < desired_num_chars;
- lip = lip->next, line_diff++) {
- num_chars = strlen (lip->data);
- total_chars += num_chars;
- }
- while (lip != NULL && line_diff < distance && s_IsBlank (lip->data)) {
- lip = lip->next;
- line_diff ++;
- }
- /* skip over line we would need for ID */
- if (lip != NULL) {
- lip = lip->next;
- line_diff ++;
- }
-
- if (lip == NULL || line_diff == distance) {
- return 0;
- }
- num_starts = 1;
- list = lip->next;
- start_of_unknown = line_diff;
- length_list = NULL;
- total_chars = 0;
- for (lip = list;
- lip != NULL && line_diff < distance;
- lip = lip->next, line_diff++)
- {
- num_chars = strlen (lip->data);
- length = s_IntLinkNew (num_chars, length_list);
- if (length_list == NULL) {
- length_list = length;
- }
- total_chars += num_chars;
- }
- /* how many offsets do we need? */
- num_additional_offsets_needed = (total_chars / desired_num_chars);
- if (num_additional_offsets_needed == 0) {
- return 0;
- }
- /* Find all the places you could start and get the exact right number
- * of characters
- */
- start_list = NULL;
- num_starts = 0;
- pattern_length = 0;
- for (start_ptr = length_list, line_diff = start_of_unknown;
- start_ptr != NULL && line_diff < distance
- && pattern_length < distance - line_diff ;
- start_ptr = start_ptr->next, line_diff++) {
- num_chars = start_ptr->ival;
- pattern_length = 1;
- length = start_ptr->next;
- while (num_chars < desired_num_chars
- && pattern_length + line_diff < distance
- && length != NULL)
- {
- num_chars += length->ival;
- pattern_length ++;
- length = length->next;
- }
- if (num_chars == desired_num_chars) {
- length = s_IntLinkNew (line_diff, start_list);
- if (start_list == NULL) {
- start_list = length;
- }
- num_starts ++;
- }
- }
- /* now select best set of start points */
-
- s_IntLinkFree (length_list);
- s_IntLinkFree (start_list);
- return 0;
- }
- /* This function inserts new block locations into the offset_list
- * by looking for likely starts of abnormal patterns.
- */
- static void s_InsertNewOffsets
- (TLineInfoPtr token_list,
- TIntLinkPtr offset_list,
- int block_length,
- int best_num_chars,
- char * alphabet)
- {
- TLineInfoPtr lip, prev_start;
- TIntLinkPtr prev_offset, new_offset, splice_offset;
- int line_diff, num_chars, line_start;
- if (token_list == NULL || offset_list == NULL
- || block_length < 1 || best_num_chars < 1)
- {
- return;
- }
- lip = token_list;
- prev_offset = NULL;
- for (new_offset = offset_list;
- new_offset != NULL && lip != NULL;
- new_offset = new_offset->next) {
- if (prev_offset == NULL) {
- /* just advance through tokens */
- for (line_diff = 0;
- line_diff < new_offset->ival && lip != NULL;
- line_diff ++) {
- lip = lip->next;
- }
- } else {
- if (new_offset->ival - prev_offset->ival == block_length) {
- /* just advance through tokens */
- for (line_diff = 0;
- line_diff < new_offset->ival - prev_offset->ival
- && lip != NULL;
- line_diff ++) {
- lip = lip->next;
- }
- } else {
- /* look for intermediate breaks */
- prev_start = lip;
- num_chars = 0;
- for (line_diff = 0;
- line_diff < new_offset->ival - prev_offset->ival
- && lip != NULL && num_chars < best_num_chars;
- line_diff ++) {
- num_chars += strlen (lip->data);
- lip = lip->next;
- }
- if (lip == NULL) {
- return;
- }
- /* set new offset at first line of next pattern */
- line_diff ++;
- lip = lip->next;
- if (line_diff < new_offset->ival - prev_offset->ival) {
- line_start = line_diff + prev_offset->ival;
- /* advance token pointer to new piece */
- while (line_diff < new_offset->ival - prev_offset->ival
- && lip != NULL)
- {
- lip = lip->next;
- line_diff ++;
- }
- /* insert new offset value */
- splice_offset = s_IntLinkNew (line_start, NULL);
- if (splice_offset == NULL) {
- return;
- }
- splice_offset->next = new_offset;
- prev_offset->next = splice_offset;
- s_CountCharactersBetweenOffsets (lip,
- new_offset->ival - splice_offset->ival,
- best_num_chars);
- }
- }
- }
- prev_offset = new_offset;
- }
-
- /* iterate through the last block */
- for (line_diff = 0;
- line_diff < block_length && lip != NULL;
- line_diff ++) {
- lip = lip->next;
- }
- /* if we have room for one more sequence, or even most of one more sequence, add it */
- if (lip != NULL && ! s_SkippableString (lip->data)) {
- splice_offset = s_IntLinkNew (line_diff + prev_offset->ival, prev_offset);
- }
- }
- /* This function returns true if the string contains digits, false otherwise */
- static EBool s_ContainsDigits (char *data)
- {
- char *cp;
- if (data == NULL) return eFalse;
- for (cp = data; *cp != 0; cp++) {
- if (isdigit (*cp)) {
- return eTrue;
- }
- }
- return eFalse;
- }
- /* This function processes the alignment file data by dividing the original
- * lines into pieces based on whitespace and looking for patterns of length
- * in the data.
- */
- static void s_ProcessAlignFileRawByLengthPattern (SAlignRawFilePtr afrp)
- {
- TLineInfoPtr token_list;
- SLengthListPtr list;
- TLineInfoPtr lip;
- SLengthListPtr anchorpattern[2];
- TIntLinkPtr offset_list;
- int best_length;
- int best_num_chars;
- if (afrp == NULL || afrp->line_list == NULL) {
- return;
- }
- token_list = s_BuildTokenList (afrp->line_list);
- token_list = s_RemoveCommentsFromTokens (token_list);
- token_list = s_RemoveNexusCommentsFromTokens (token_list);
- list = s_LengthListNew ( NULL );
- for (lip = token_list;
- lip != NULL && ! s_FoundStopLine (lip->data);
- lip = lip->next)
- {
- if (s_SkippableString (lip->data) || s_ContainsDigits(lip->data)) {
- s_AddLengthRepeat (list, 0);
- } else {
- s_AddLengthRepeat (list, strlen (lip->data));
- }
- }
- anchorpattern [0] = s_FindMostPopularPattern (list->lengthrepeats);
- anchorpattern [1] = NULL;
- if (anchorpattern [0] == NULL || anchorpattern[0]->lengthrepeats == NULL) {
- return;
- }
- /* find anchor patterns in original list,
- * find distances between anchor patterns
- */
- offset_list = s_CreateOffsetList (list->lengthrepeats, anchorpattern[0]);
- offset_list = s_AugmentOffsetList (offset_list,
- list->lengthrepeats,
- anchorpattern[0]);
- /* resolve unusual distances between anchor patterns */
- best_length = s_GetMostPopularPatternLength (offset_list);
- if (best_length < 1 && offset_list != NULL && offset_list->next != NULL) {
- best_length = offset_list->next->ival - offset_list->ival;
- }
- best_num_chars = s_GetBestCharacterLength (token_list, offset_list,
- best_length);
- s_InsertNewOffsets (token_list, offset_list, best_length, best_num_chars,
- afrp->alphabet);
- /* use token before each anchor pattern as ID, use tokens for distance
- * between anchor patterns for sequence data
- */
- s_CreateSequencesBasedOnTokenPatterns (token_list, offset_list,
- anchorpattern, afrp);
-
- s_LengthListFree (anchorpattern[0]);
- s_LengthListFree (list);
- s_LineInfoFree (token_list);
- }
/* The following functions are used to convert data from the internal
 * representation into the form that will be passed to the calling
 * program.  Information from the ID strings is parsed to remove
 * definition lines and organism information, the gap characters are
 * standardized to '-', the missing characters are standardized to 'N',
 * match characters are replaced with characters from the first record,
 * and bad characters are reported.
 */
- /* This function allocates memory for a new AligmentFileData structure
- * and initializes its member variables.
- */
- extern TAlignmentFilePtr AlignmentFileNew (void)
- {
- TAlignmentFilePtr afp;
- afp = (TAlignmentFilePtr) malloc (sizeof (SAlignmentFile));
- if (afp == NULL) {
- return NULL;
- }
- afp->num_sequences = 0;
- afp->num_organisms = 0;
- afp->num_deflines = 0;
- afp->num_segments = 0;
- afp->ids = NULL;
- afp->sequences = NULL;
- afp->organisms = NULL;
- afp->deflines = NULL;
- return afp;
- }
- /* This function frees the memory associated with an AligmentFileData
- * structure and its member variables.
- */
- extern void AlignmentFileFree (TAlignmentFilePtr afp)
- {
- int index;
- if (afp == NULL) {
- return;
- }
- if (afp->ids != NULL) {
- for (index = 0; index < afp->num_sequences; index++) {
- free (afp->ids [index]);
- }
- free (afp->ids);
- afp->ids = NULL;
- }
- if (afp->sequences != NULL) {
- for (index = 0; index < afp->num_sequences; index++) {
- free (afp->sequences [index]);
- }
- free (afp->sequences);
- afp->sequences = NULL;
- }
- if (afp->organisms != NULL) {
- for (index = 0; index < afp->num_organisms; index++) {
- free (afp->organisms [index]);
- }
- free (afp->organisms);
- afp->sequences = NULL;
- }
- if (afp->deflines != NULL) {
- for (index = 0; index < afp->num_deflines; index++) {
- free (afp->deflines [index]);
- }
- free (afp->deflines);
- afp->deflines = NULL;
- }
- free (afp);
- }
/* This function parses the identifier string used by the alignment file
 * to identify a sequence to find the portion of the string that is actually
 * an ID, as opposed to organism information or definition line.
 *
 * Leading spaces, tabs and '>' characters are skipped; the ID is the run
 * of characters that follows, up to the next space, tab, carriage return
 * or newline.
 *
 * Returns a newly allocated copy of the ID (release it with free), or NULL
 * when str is NULL, when no ID characters are present, or when the
 * allocation fails.
 */
static char * s_GetIdFromString (char * str)
{
    char * cp;
    char * id;
    size_t len;

    if (str == NULL) {
        return NULL;
    }

    cp = str + strspn (str, " >\t");
    len = strcspn (cp, " \t\r\n");
    if (len == 0) {
        return NULL;
    }
    id = malloc (len + 1);
    if (id == NULL) {
        return NULL;
    }
    memcpy (id, cp, len);
    id [ len ] = 0;
    return id;
}
/* This function pulls defline information from the ID string, if there is
 * any.
 *
 * The ID is located the same way s_GetIdFromString does it (skip leading
 * spaces, tabs and '>'; the ID runs to the next space, tab, carriage
 * return or newline).  The defline is whatever follows the whitespace
 * after the ID.
 *
 * Returns a newly allocated copy of the defline (release it with free), or
 * NULL when str is NULL, when there is no ID, or when nothing but
 * whitespace follows the ID.
 */
static char * s_GetDeflineFromIdString (char * str)
{
    char * cp;
    size_t len;

    if (str == NULL) {
        return NULL;
    }

    cp = str + strspn (str, " >\t");
    len = strcspn (cp, " \t\r\n");
    if (len == 0) {
        return NULL;
    }
    cp += len;                          /* step over the ID */
    len = strspn (cp, " \t\r\n");
    if (len == 0) {
        return NULL;
    }
    cp += len;                          /* step over the separator */
    if (*cp == 0) {
        return NULL;
    }
    return strdup (cp);
}
- /* This function takes the ID strings read from the file and parses them
- * to obtain a defline (if there is extra text after the ID and/or
- * organism information) and to obtain the actual ID for the sequence.
- */
- static EBool s_ReprocessIds (SAlignRawFilePtr afrp)
- {
- TStringCountPtr list, scp;
- TAlignRawSeqPtr arsp;
- TLineInfoPtr lip;
- char * id;
- int line_num;
- EBool rval = eTrue;
- char * defline;
- if (afrp == NULL) {
- return eFalse;
- }
- list = NULL;
- lip = afrp->deflines;
- for (arsp = afrp->sequences; arsp != NULL; arsp = arsp->next) {
- if (arsp->id_lines != NULL) {
- line_num = arsp->id_lines->ival;
- } else {
- line_num = -1;
- }
- s_RemoveOrganismCommentFromLine (arsp->id);
- id = s_GetIdFromString (arsp->id);
- if (lip == NULL) {
- defline = s_GetDeflineFromIdString (arsp->id);
- afrp->deflines = s_AddLineInfo (afrp->deflines, defline,
- line_num, 0);
- free (defline);
- afrp->num_deflines ++;
- }
- free (arsp->id);
- arsp->id = id;
- list = s_AddStringCount (arsp->id, line_num, list);
- }
- for (scp = list; scp != NULL; scp = scp->next) {
- if (scp->num_appearances > 1) {
- rval = eFalse;
- s_ReportRepeatedId (scp, afrp->report_error,
- afrp->report_error_userdata);
- }
- }
- return rval;
- }
- /* This function reports unacceptable characters in a sequence. Frequently
- * there will be more than one character of the same kind (for instance,
- * when the user has incorrectly specified a gap character), so repeated
- * characters are reported together. The function advances the data
- * position in the SLineInfoReader structure lirp, and returns the
- * current data position for lirp.
- */
- static int
- s_ReportRepeatedBadCharsInSequence
- (TLineInfoReaderPtr lirp,
- char * id,
- char * reason,
- FReportErrorFunction report_error,
- void * report_error_userdata)
- {
- int bad_line_num, bad_line_offset;
- int num_bad_chars;
- char bad_char, curr_char;
- int data_position;
- bad_line_num = s_LineInfoReaderGetCurrentLineNumber (lirp);
- bad_line_offset = s_LineInfoReaderGetCurrentLineOffset (lirp);
- bad_char = *lirp->curr_line_pos;
- num_bad_chars = 1;
- data_position = lirp->data_pos + 1;
- while ((curr_char = s_FindNthDataChar (lirp, data_position)) == bad_char) {
- num_bad_chars ++;
- data_position ++;
- }
- s_ReportBadCharError (id, bad_char, num_bad_chars,
- bad_line_offset, bad_line_num, reason,
- report_error, report_error_userdata);
- return data_position;
- }
- /* This function does context-sensitive replacement of the missing,
- * match, and gap characters and also identifies bad characters.
- * Gap characters found in the wrong location in the sequence are
- * considered an error. Characters that are not missing, match, or
- * gap characters and are not in the specified sequence alphabet are
- * reported as errors. Match characters in the first sequence are also
- * reported as errors.
- * The function will return eTrue if any errors were found, or eFalse
- * if there were no errors.
- */
- static EBool
- s_FindBadDataCharsInSequence
- (TAlignRawSeqPtr arsp,
- TAlignRawSeqPtr master_arsp,
- TSequenceInfoPtr sip,
- FReportErrorFunction report_error,
- void * report_error_userdata)
- {
- TLineInfoReaderPtr lirp, master_lirp;
- int data_position;
- int middle_start;
- int middle_end;
- char curr_char, master_char;
- EBool found_middle_start;
- EBool rval = eFalse;
- EBool match_not_in_beginning_gap;
- EBool match_not_in_end_gap;
- if (arsp == NULL || master_arsp == NULL || sip == NULL) {
- return eTrue;
- }
- lirp = s_LineInfoReaderNew (arsp->sequence_data);
- if (lirp == NULL) {
- return eTrue;
- }
- if (arsp != master_arsp) {
- master_lirp = s_LineInfoReaderNew (master_arsp->sequence_data);
- if (master_lirp == NULL) {
- s_LineInfoReaderFree (lirp);
- return eTrue;
- }
- } else {
- master_lirp = NULL;
- }
- if (strcspn (sip->beginning_gap, sip->match)
- == strlen (sip->beginning_gap)) {
- match_not_in_beginning_gap = eTrue;
- } else {
- match_not_in_beginning_gap = eFalse;
- }
- if (strcspn (sip->end_gap, sip->match) == strlen (sip->end_gap)) {
- match_not_in_end_gap = eTrue;
- } else {
- match_not_in_end_gap = eFalse;
- }
- /* First, find middle start and end positions and report characters
- * that are not beginning gap before the middle
- */
- found_middle_start = eFalse;
- data_position = 0;
- curr_char = s_FindNthDataChar (lirp, data_position);
- while (curr_char != 0) {
- if (strchr (sip->alphabet, curr_char) != NULL) {
- if (! found_middle_start) {
- middle_start = data_position;
- found_middle_start = eTrue;
- }
- middle_end = data_position + 1;
- data_position ++;
- } else if (! found_middle_start) {
- if (match_not_in_beginning_gap
- && strchr (sip->match, curr_char) != NULL)
- {
- middle_start = data_position;
- found_middle_start = eTrue;
- middle_end = data_position + 1;
- data_position ++;
- } else if (strchr (sip->beginning_gap, curr_char) == NULL) {
- /* Report error - found character that is not beginning gap
- in beginning gap */
- data_position = s_ReportRepeatedBadCharsInSequence (lirp,
- arsp->id,
- "expect only beginning gap characters here",
- report_error, report_error_userdata);
- rval = eTrue;
- } else {
- *lirp->curr_line_pos = '-';
- data_position ++;
- }
- } else {
- if (match_not_in_end_gap
- && strchr (sip->match, curr_char) != NULL)
- {
- middle_end = data_position + 1;
- }
- data_position ++;
- }
- curr_char = s_FindNthDataChar (lirp, data_position);
- }
- if (! found_middle_start) {
- s_ReportMissingSequenceData (arsp->id,
- report_error, report_error_userdata);
- s_LineInfoReaderFree (lirp);
- return eTrue;
- }
- /* Now complain about bad middle characters */
- data_position = middle_start;
- while (data_position < middle_end)
- {
- curr_char = s_FindNthDataChar (lirp, data_position);
- while (data_position < middle_end
- && strchr (sip->alphabet, curr_char) != NULL) {
- data_position ++;
- curr_char = s_FindNthDataChar (lirp, data_position);
- }
- if (curr_char == 0 || data_position >= middle_end) {
- /* do nothing, done with middle */
- } else if (strchr (sip->missing, curr_char) != NULL) {
- *lirp->curr_line_pos = 'N';
- data_position ++;
- } else if (strchr (sip->match, curr_char) != NULL) {
- master_char = s_FindNthDataChar (master_lirp, data_position);
- if (master_char == 0) {
- /* report error - unable to get master char */
- if (master_arsp == arsp) {
- data_position = s_ReportRepeatedBadCharsInSequence (lirp,
- arsp->id,
- "can't specify match chars in first sequence",
- report_error, report_error_userdata);
- } else {
- data_position = s_ReportRepeatedBadCharsInSequence (lirp,
- arsp->id,
- "can't find source for match chars",
- report_error, report_error_userdata);
- }
- rval = eTrue;
- } else {
- *lirp->curr_line_pos = master_char;
- data_position ++;
- }
- } else if (strchr (sip->middle_gap, curr_char) != NULL) {
- *lirp->curr_line_pos = '-';
- data_position ++;
- } else {
- /* Report error - found bad character in middle */
- data_position = s_ReportRepeatedBadCharsInSequence (lirp,
- arsp->id,
- "expect only sequence, missing, match,"
- " and middle gap characters here",
- report_error, report_error_userdata);
- rval = eTrue;
- }
- }
- /* Now find and complain about end characters */
- data_position = middle_end;
- curr_char = s_FindNthDataChar (lirp, data_position);
- while (curr_char != 0) {
- if (strchr (sip->end_gap, curr_char) == NULL) {
- /* Report error - found bad character in middle */
- data_position = s_ReportRepeatedBadCharsInSequence (lirp, arsp->id,
- "expect only end gap characters here",
- report_error, report_error_userdata);
- rval = eTrue;
- } else {
- *lirp->curr_line_pos = '-';
- data_position++;
- }
- curr_char = s_FindNthDataChar (lirp, data_position);
- }
- s_LineInfoReaderFree (lirp);
- s_LineInfoReaderFree (master_lirp);
- return rval;
- }
- /* This function examines each sequence and replaces the special characters
- * and reports bad characters in each one. The function will return eTrue
- * if any of the sequences contained bad characters or eFalse if no errors
- * were seen.
- */
- static EBool
- s_s_FindBadDataCharsInSequenceList
- (SAlignRawFilePtr afrp,
- TSequenceInfoPtr sip)
- {
- TAlignRawSeqPtr arsp;
- EBool rval = eFalse;
- if (afrp == NULL || afrp->sequences == NULL) {
- return eTrue;
- }
- for (arsp = afrp->sequences; arsp != NULL; arsp = arsp->next) {
- if (s_FindBadDataCharsInSequence (arsp, afrp->sequences, sip,
- afrp->report_error,
- afrp->report_error_userdata)) {
- rval = eTrue;
- }
- }
- return rval;
- }
- /* This function examines the organisms listed for the alignment and determines
- * whether any of the organism names (including the associated comments) are
- * repeated.
- */
- static EBool s_AreOrganismsUnique (SAlignRawFilePtr afrp)
- {
- TLineInfoPtr this_org, lip;
- TAlignRawSeqPtr arsp;
- EBool are_unique;
- if (afrp == NULL || afrp->num_organisms == 0
- || afrp->organisms == NULL) {
- return eFalse;
- }
- are_unique = eTrue;
- for (this_org = afrp->organisms;
- this_org != NULL;
- this_org = this_org->next) {
- lip = afrp->organisms;
- arsp = afrp->sequences;
- while (lip != NULL && lip != this_org
- && strcmp (lip->data, this_org->data) != 0 && arsp != NULL) {
- lip = lip->next;
- arsp = arsp->next;
- }
- if (lip != NULL && lip != this_org) {
- are_unique = eFalse;
- s_ReportRepeatedOrganismName (arsp->id, this_org->line_num,
- lip->line_num,
- this_org->data,
- afrp->report_error,
- afrp->report_error_userdata);
- }
- }
- return are_unique;
- }
- /* This function reports whether the definition lines are identical for
- * each sequence or not.
- */
- static EBool s_AreDeflinesIdentical (SAlignRawFilePtr afrp)
- {
- TLineInfoPtr lip;
- TStringCountPtr list;
- EBool rval;
- if (afrp == NULL) {
- return eFalse;
- }
- list = NULL;
- for (lip = afrp->deflines; lip != NULL; lip = lip->next) {
- list = s_AddStringCount (lip->data, lip->line_num, list);
- }
- rval = eTrue;
- if (list != NULL && list->next != NULL) {
- rval = eFalse;
- s_ReportDefinitionLineMismatch (afrp->report_error,
- afrp->report_error_userdata);
- s_ReportDefinitionLines (list, afrp->report_error,
- afrp->report_error_userdata);
- }
- s_StringCountFree (list);
- return rval;
- }
- /* This function uses the contents of an SAlignRawFileData structure to
- * create an SAlignmentFile structure with the appropriate information.
- */
- static TAlignmentFilePtr
- s_ConvertDataToOutput
- (SAlignRawFilePtr afrp,
- TSequenceInfoPtr sip)
- {
- TAlignRawSeqPtr arsp;
- int index;
- TSizeInfoPtr * lengths;
- int * best_length;
- TAlignmentFilePtr afp;
- TLineInfoPtr lip;
- int curr_seg;
- if (afrp == NULL || sip == NULL || afrp->sequences == NULL) {
- return NULL;
- }
- afp = AlignmentFileNew ();
- if (afp == NULL) {
- return NULL;
- }
- afp->num_organisms = afrp->num_organisms;
- afp->num_deflines = afrp->num_deflines;
- afp->num_segments = afrp->num_segments;
- afp->num_sequences = 0;
- lengths = NULL;
- for (arsp = afrp->sequences; arsp != NULL; arsp = arsp->next) {
- afp->num_sequences++;
- }
- if (afp->num_sequences != afrp->num_organisms
- && afp->num_sequences / afp->num_segments != afrp->num_organisms) {
- s_ReportMissingOrganismInfo (afrp->report_error,
- afrp->report_error_userdata);
- } else {
- s_AreOrganismsUnique (afrp);
- }
- afp->sequences = (char **)malloc (afp->num_sequences
- * sizeof (char *));
- if (afp->sequences == NULL) {
- AlignmentFileFree (afp);
- return NULL;
- }
- afp->ids = (char **)malloc (afp->num_sequences * sizeof (char *));
- if (afp->ids == NULL) {
- AlignmentFileFree (afp);
- return NULL;
- }
- if (afp->num_organisms > 0) {
- afp->organisms = (char **) malloc (afp->num_organisms
- * sizeof (char *));
- if (afp->organisms == NULL) {
- AlignmentFileFree (afp);
- return NULL;
- }
- }
- if (afp->num_deflines > 0) {
- afp->deflines = (char **)malloc (afp->num_deflines
- * sizeof (char *));
- if (afp->deflines == NULL) {
- AlignmentFileFree (afp);
- return NULL;
- }
- }
- /* copy in deflines */
- for (lip = afrp->deflines, index = 0;
- lip != NULL && index < afp->num_deflines;
- lip = lip->next, index++) {
- if (lip->data == NULL) {
- afp->deflines [index] = NULL;
- } else {
- afp->deflines [index] = strdup (lip->data);
- }
- }
- while (index < afp->num_deflines) {
- afp->deflines [index ++] = NULL;
- }
- /* copy in organism information */
- for (lip = afrp->organisms, index = 0;
- lip != NULL && index < afp->num_organisms;
- lip = lip->next, index++) {
- afp->organisms [index] = strdup (lip->data);
- }
-
- /* we need to store length information about different segments separately */
- lengths = (TSizeInfoPtr *) malloc (sizeof (TSizeInfoPtr) * afrp->num_segments);
- if (lengths == NULL) {
- AlignmentFileFree (afp);
- return NULL;
- }
- best_length = (int *) malloc (sizeof (int) * afrp->num_segments);
- if (best_length == NULL) {
- free (lengths);
- AlignmentFileFree (afp);
- return NULL;
- }
- for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++) {
- lengths [curr_seg] = NULL;
- best_length [curr_seg] = 0;
- }
-
- /* copy in sequence data */
- curr_seg = 0;
- for (arsp = afrp->sequences, index = 0;
- arsp != NULL && index < afp->num_sequences;
- arsp = arsp->next, index++) {
- afp->sequences [index] =
- s_LineInfoMergeAndStripSpaces (arsp->sequence_data);
- if (afp->sequences [index] != NULL) {
- lengths [curr_seg] = s_AddSizeInfo (lengths [curr_seg], strlen (afp->sequences [index]));
- }
- afp->ids [index] = strdup (arsp->id);
- curr_seg ++;
- if (curr_seg >= afrp->num_segments) {
- curr_seg = 0;
- }
- }
- for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
- {
- best_length [curr_seg] = s_GetMostPopularSize (lengths [curr_seg]);
- if (best_length [curr_seg] == 0 && lengths [curr_seg] != NULL) {
- best_length [curr_seg] = lengths [curr_seg]->size_value;
- }
- }
- curr_seg = 0;
- for (index = 0; index < afp->num_sequences; index++) {
- if (afp->sequences [index] == NULL) {
- s_ReportMissingSequenceData (afp->ids [index],
- afrp->report_error,
- afrp->report_error_userdata);
- } else if ((int) strlen (afp->sequences [index]) != best_length [curr_seg]) {
- s_ReportBadSequenceLength (afp->ids [index], best_length [curr_seg],
- strlen (afp->sequences [index]),
- afrp->report_error,
- afrp->report_error_userdata);
- }
- curr_seg ++;
- if (curr_seg >= afrp->num_segments) {
- curr_seg = 0;
- }
- }
- if (afrp->expected_num_sequence > 0
- && afrp->expected_num_sequence != afp->num_sequences)
- {
- s_ReportIncorrectNumberOfSequences (afrp->expected_num_sequence,
- afp->num_sequences,
- afrp->report_error,
- afrp->report_error_userdata);
- }
- if (afrp->expected_sequence_len > 0
- && afrp->expected_sequence_len != best_length [0])
- {
- s_ReportIncorrectSequenceLength (afrp->expected_sequence_len,
- best_length [0],
- afrp->report_error,
- afrp->report_error_userdata);
- }
-
- free (best_length);
- for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
- {
- s_SizeInfoFree (lengths [curr_seg]);
- }
- free (lengths);
-
- return afp;
- }
- /* This is the function called by the calling program to read an alignment
- * file. The readfunc argument is a function pointer supplied by the
- * calling program which this library will use to read in data from the
- * file one line at a time. The fileuserdata argument is a pointer to
- * data used by the calling program's readfunc function and will be passed
- * back with each call to readfunc.
- * The errfunc argument is a function pointer supplied by the calling
- * program for reporting errors. The erroruserdata argument is a pointer
- * to data used by the calling program's errfunc function and will be
- * passed back with each call to readfunc.
- * The sequence_info argument contains the sequence alphabet and missing,
- * match, and gap characters to use in interpreting the sequence data.
- */
- extern TAlignmentFilePtr
- ReadAlignmentFile
- (FReadLineFunction readfunc,
- void * fileuserdata,
- FReportErrorFunction errfunc,
- void * erroruserdata,
- TSequenceInfoPtr sequence_info)
- {
- SAlignRawFilePtr afrp;
- TAlignmentFilePtr afp;
- if (sequence_info == NULL || sequence_info->alphabet == NULL) {
- return NULL;
- }
- afrp = s_ReadAlignFileRaw ( readfunc, fileuserdata, sequence_info,
- errfunc, erroruserdata);
- if (afrp == NULL) {
- return NULL;
- }
- if (afrp->block_size > 1) {
- s_ProcessAlignRawFileByBlockOffsets (afrp);
- } else if (afrp->marked_ids) {
- s_ProcessAlignFileRawForMarkedIDs (afrp);
- } else {
- s_ProcessAlignFileRawByLengthPattern (afrp);
- }
- s_ReprocessIds (afrp);
- #if 0 /* this step was removed by indexer request */
- /* Note - have to check deflines after reprocessing IDs */
- s_AreDeflinesIdentical (afrp);
- #endif
- if (s_s_FindBadDataCharsInSequenceList (afrp, sequence_info)) {
- s_AlignFileRawFree (afrp);
- return NULL;
- }
- afp = s_ConvertDataToOutput (afrp, sequence_info);
- s_AlignFileRawFree (afrp);
-
- return afp;
- }
- /*
- * ===========================================================================
- * $Log: alnread.c,v $
- * Revision 1000.1 2004/06/01 19:41:15 gouriano
- * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.10
- *
- * Revision 1.10 2004/05/20 19:40:24 bollin
- * Made changes to allow reading of alignments of segmented sets.
- * Also added warnings for when organism lines may be present but improperly
- * formatted.
- *
- * Revision 1.9 2004/03/16 21:05:15 bollin
- * Added some improvements to the portion of the alignment reader that deals
- * with contiguous alignments that do not have a '>' at the beginning of each
- * ID.
- *
- * Revision 1.8 2004/03/16 16:25:38 bollin
- * Added function to recognize a file as ASN.1 and reject immediately
- *
- * Revision 1.7 2004/03/09 21:27:39 bollin
- * in s_InsertNewOffsets, if the list ends while searching for the next pattern, exit immediately (prevents NULL pointer access)
- *
- * Revision 1.6 2004/03/04 19:15:07 bollin
- * file reading now skips over multi-line bracketed comments
- *
- * Revision 1.5 2004/03/04 16:29:32 bollin
- * added skip of taxa comment for PAUP format alignment files
- *
- * Revision 1.4 2004/02/10 16:15:13 bollin
- * now checks for unused lines when finding interleaved blocks, will reject and try other methods if unused lines found after first block found.
- *
- * Revision 1.3 2004/02/05 16:29:32 bollin
- * smarter function for skipping NEXUS comment lines
- *
- * Revision 1.2 2004/02/04 19:49:11 bollin
- * fixed infinite loop condition in s_AugmentOffsetList, properly skip over first non-space column when looking for interleaved block patterns in s_ReadAlignFileRaw
- *
- * Revision 1.1 2004/02/03 16:47:02 ucko
- * Add Colleen Bollin's Toolkit-independent alignment reader.
- *
- * Revision 1.38 2004/01/30 22:46:08 bollin
- * renamed defined variable, fixed typo in comment
- *
- * Revision 1.37 2004/01/30 21:48:14 bollin
- * changes for compatibility with Windows
- *
- * Revision 1.36 2004/01/30 21:33:41 bollin
- * replaced strncasecmp and strncase function calls
- *
- * Revision 1.35 2004/01/29 19:16:27 bollin
- * use EBool for boolean values
- *
- * Revision 1.34 2004/01/29 17:58:11 bollin
- * aligned assignment blocks in New functions
- *
- * Revision 1.33 2004/01/29 17:43:40 bollin
- * added directory specification to alnread.h include line
- *
- * Revision 1.32 2004/01/29 17:41:29 bollin
- * added comment block, id tags, log
- *
- * ===========================================================================
- */