生物技术

开发平台：

C/C++

alnread.c：源码内容

return offset;
}
}
return -1;
}
/* Look up the ID of the sequence stored at zero-based position 'offset'
 * in the linked list that starts at 'list'.
 *
 * Returns the id pointer held by that list entry (the stored pointer itself,
 * not a copy), or NULL when the list has no entry at that position.  A
 * negative offset never matches any position and therefore yields NULL.
 */
static char *
s_GetAlignRawSeqIDByOffset
(TAlignRawSeqPtr list,
 int             offset)
{
    TAlignRawSeqPtr node;
    int             position = 0;

    for (node = list; node != NULL; node = node->next) {
        if (position == offset) {
            return node->id;
        }
        position++;
    }
    return NULL;
}
/* This function adds one line of data to the sequence whose ID matches 'id'.
 * If no sequence with that ID exists yet, a new one is created through
 * s_AlignRawSeqNew (list) and given a private copy of the ID.
 *
 * id_line_num is recorded in the sequence's id_lines list; data,
 * data_line_num and data_line_offset are forwarded to s_AddLineInfo.
 *
 * The function returns a pointer to the first item in the list.  If the new
 * sequence cannot be allocated the list is returned unchanged (which is NULL
 * only when the list was empty to begin with), so callers that store the
 * return value back into their list head do not lose the sequences that
 * were already collected.
 */
static TAlignRawSeqPtr
s_AddAlignRawSeqById
(TAlignRawSeqPtr list,
 char *          id,
 char *          data,
 int             id_line_num,
 int             data_line_num,
 int             data_line_offset)
{
    TAlignRawSeqPtr arsp;
    TIntLinkPtr     ilp;

    arsp = s_FindAlignRawSeqById (list, id);
    if (arsp == NULL) {
        arsp = s_AlignRawSeqNew (list);
        if (arsp == NULL) {
            /* allocation failed: keep what we already have */
            return list;
        }
        if (list == NULL) {
            list = arsp;
        }
        arsp->id = strdup (id);
    }
    arsp->sequence_data = s_AddLineInfo (arsp->sequence_data,
                                         data,
                                         data_line_num,
                                         data_line_offset);
    /* the new link is passed the existing id_lines list; only an empty
     * list needs its head set here
     */
    ilp = s_IntLinkNew (id_line_num, arsp->id_lines);
    if (arsp->id_lines == NULL) {
        arsp->id_lines = ilp;
    }
    return list;
}
/* Append a line of data to the sequence found 'index' steps from the head
 * of 'list' (index 0 is the first sequence; an index below zero also
 * selects the first sequence because no steps are taken).
 *
 * data, data_line_num and data_line_offset are handed to s_AddLineInfo.
 * Returns eTrue when the data was added, eFalse when the list ends before
 * the requested position is reached.
 */
static EBool
s_AddAlignRawSeqByIndex
(TAlignRawSeqPtr list,
 int             index,
 char *          data,
 int             data_line_num,
 int             data_line_offset)
{
    TAlignRawSeqPtr target = list;
    int             steps = 0;

    while (target != NULL && steps < index) {
        target = target->next;
        steps++;
    }
    if (target == NULL) {
        return eFalse;
    }
    target->sequence_data = s_AddLineInfo (target->sequence_data,
                                           data,
                                           data_line_num,
                                           data_line_offset);
    return eTrue;
}
/* This function frees memory associated with the SAlignRawFileData structure:
 * the organism, defline and line lists, the sequence list, the offset list,
 * the alphabet string and finally the structure itself.
 * Passing NULL is allowed and does nothing.
 *
 * The alphabet member is released here because s_ReadAlignFileRaw fills it
 * with a strdup'ed copy; it was previously leaked.
 * NOTE(review): assumes no other code stores a non-heap or shared pointer in
 * afrp->alphabet - confirm against the rest of the file.
 */
static void s_AlignFileRawFree (SAlignRawFilePtr afrp)
{
    if (afrp == NULL) {
        return;
    }
    s_LineInfoFree (afrp->organisms);
    s_LineInfoFree (afrp->deflines);
    s_LineInfoFree (afrp->line_list);
    s_AlignRawSeqFree (afrp->sequences);
    s_IntLinkFree (afrp->offset_list);
    free (afrp->alphabet);
    free (afrp);
}
/* Create an SAlignRawFileData structure in its empty starting state: all
 * list and pointer members are NULL, all counters are zero, marked_ids is
 * eFalse and num_segments is 1.
 *
 * Returns the new structure, or NULL if the allocation fails.
 */
static SAlignRawFilePtr s_AlignFileRawNew (void)
{
    SAlignRawFilePtr fresh;

    fresh = (SAlignRawFilePtr) malloc (sizeof (SAlignRawFileData));
    if (fresh != NULL) {
        /* content collected while reading */
        fresh->line_list = NULL;
        fresh->organisms = NULL;
        fresh->deflines = NULL;
        fresh->sequences = NULL;
        fresh->offset_list = NULL;
        fresh->alphabet = NULL;

        /* counters and layout information */
        fresh->num_organisms = 0;
        fresh->num_deflines = 0;
        fresh->block_size = 0;
        fresh->expected_num_sequence = 0;
        fresh->expected_sequence_len = 0;
        fresh->num_segments = 1;
        fresh->marked_ids = eFalse;

        /* error reporting hooks */
        fresh->report_error = NULL;
        fresh->report_error_userdata = NULL;
    }
    return fresh;
}
/* The following functions are used to analyze the structure of a file and
* assemble the sequences listed in the file.
* Sequence data in a file is organized in one of two general formats -
* interleaved or contiguous. Interleaved data can be recognized by looking
* for repeated blocks of the same number of lines within a file separated
* by blank or skippable lines from other lines in the file. The first of
* these blocks must have at least two elements separated by whitespace
* in each line, the first of these elements is the ID for the sequence in
* that row and for the sequences in that position within the block for the
* remainder of the file.
* Contiguous data can be recognized by either looking for "marked" sequence
* IDs, which begin with a '>' character, or by looking for repeated patterns
* of lines with the same numbers of characters.
*/
/* The following functions are used to analyze interleaved data. */
/* This function creates a SLengthListData structure that describes the pattern
 * of character lengths in the string pointed to by cp: the string is cut into
 * runs of non-whitespace characters and the length of each run is added to
 * the pattern with s_AddLengthRepeat.  A string that begins with whitespace
 * contributes a leading run of length zero.
 *
 * Whitespace here means space, tab and carriage return.  (The delimiter set
 * had lost its backslashes and read " tr", which split tokens at the letters
 * 't' and 'r'.)
 *
 * Returns the new pattern with num_appearances set to 1, or NULL if the
 * pattern could not be allocated.  A NULL cp yields a pattern without any
 * length entries.
 */
static SLengthListPtr s_GetBlockPattern (char * cp)
{
    SLengthListPtr this_pattern;
    int            len;

    this_pattern = s_LengthListNew (NULL);
    if (this_pattern == NULL) {
        return NULL;
    }

    this_pattern->num_appearances = 1;
    while (cp != NULL && *cp != 0) {
        len = (int) strcspn (cp, " \t\r");
        s_AddLengthRepeat (this_pattern, len);
        cp += len;
        cp += strspn (cp, " \t\r");
    }
    return this_pattern;
}
/* This function attempts to predict whether the following lines will be
 * an interleaved block.  If so, the function returns the location of the
 * beginning of the block (line_start), otherwise the function returns -1.
 *
 * pattern_list - first length pattern of the candidate block; each entry
 *                stands for num_appearances consecutive lines
 * next_offset  - next already-known block start, or NULL if there is none
 * line_start   - line number that pattern_list corresponds to
 * block_size   - number of lines a block must contain
 *
 * Fix: when the candidate block ends exactly at the end of the pattern list
 * the walk below leaves llp == NULL; the old code then dereferenced it.
 * That case is now treated like "nothing follows the block".
 */
static int
s_ForecastBlockPattern
(SLengthListPtr pattern_list,
 TIntLinkPtr    next_offset,
 int            line_start,
 int            block_size)
{
    int            line_counter;
    SLengthListPtr llp;

    line_counter = line_start;
    /* not enough room for a whole block before the next known block */
    if (next_offset != NULL
        && next_offset->ival - line_counter < block_size) {
        return -1;
    }

    /* consume patterns until block_size lines are covered, a pattern
     * without length entries is met, or the next known block is reached
     */
    for (llp = pattern_list;
         llp != NULL
           && (next_offset == NULL || line_counter < next_offset->ival - 1)
           && line_counter - line_start < block_size;
         llp = llp->next)
    {
        if (llp->lengthrepeats == NULL) {
            return -1;
        }
        line_counter += llp->num_appearances;
    }

    if (line_counter - line_start == block_size) {
        if (llp == NULL || llp->next == NULL) {
            return line_start;
        }
        llp = llp->next;
        if (llp->lengthrepeats == NULL) {
            return line_start;
        }
    }
    return -1;
}
/* Walk the line-length patterns next to the sorted list of known block
 * starts and, wherever s_ForecastBlockPattern predicts an additional block
 * at a position greater than zero, insert that position into the offset
 * list.
 *
 * pattern_list - length patterns; each entry covers num_appearances lines
 * offset_list  - block starts found so far (may be NULL)
 * block_size   - number of lines per block
 *
 * Returns the head of the offset list with the new entries inserted, or
 * NULL if a new entry cannot be allocated.
 */
static TIntLinkPtr
s_AugmentBlockPatternOffsetList
(SLengthListPtr pattern_list,
 TIntLinkPtr    offset_list,
 int            block_size)
{
    SLengthListPtr cur = pattern_list;
    TIntLinkPtr    upcoming = offset_list;
    TIntLinkPtr    previous = NULL;
    TIntLinkPtr    inserted;
    int            lines_seen = 0;
    int            predicted;
    int            at_block;

    while (cur != NULL) {
        at_block = 0;

        if (upcoming != NULL && lines_seen == upcoming->ival) {
            /* standing on a block that is already in the list */
            previous = upcoming;
            upcoming = upcoming->next;
            at_block = 1;
        } else {
            predicted = s_ForecastBlockPattern (cur, upcoming,
                                                lines_seen, block_size);
            if (predicted > 0) {
                inserted = s_IntLinkNew (predicted, NULL);
                if (inserted == NULL) {
                    return NULL;
                }
                if (previous == NULL) {
                    /* goes in front of everything known so far */
                    inserted->next = offset_list;
                    offset_list = inserted;
                } else {
                    inserted->next = upcoming;
                    previous->next = inserted;
                }
                previous = inserted;
                at_block = 1;
            }
        }

        if (at_block) {
            /* step over the lines that belong to this block */
            while (cur != NULL
                   && lines_seen - previous->ival < block_size) {
                lines_seen += cur->num_appearances;
                cur = cur->next;
            }
        } else {
            lines_seen += cur->num_appearances;
            cur = cur->next;
        }
    }
    return offset_list;
}
/* Report every run of non-empty lines that lies outside the interleaved
 * blocks listed in afrp->offset_list.
 *
 * The pattern list and afrp->line_list are walked side by side: lines in
 * front of the next block start are "unused" and are passed to
 * s_ReportUnusedLine when their pattern has length entries; then
 * afrp->block_size lines are stepped over and the next block start is
 * taken.
 *
 * Returns eTrue if an unused run was reported while the current block
 * start was not the first entry of afrp->offset_list (i.e. after the first
 * offset), eFalse otherwise.  Also returns eFalse right away when either
 * argument is NULL, there is no offset list, or block_size is below 2.
 */
static EBool
s_FindUnusedLines
(SLengthListPtr   pattern_list,
 SAlignRawFilePtr afrp)
{
    TIntLinkPtr    block_start;
    SLengthListPtr pat;
    TLineInfoPtr   line;
    EBool          found_after_first = eFalse;
    int            lines_seen = 0;
    int            lines_in_block;
    int            remaining;

    if (pattern_list == NULL || afrp == NULL) {
        return eFalse;
    }
    if (afrp->offset_list == NULL || afrp->block_size < 2) {
        return eFalse;
    }

    block_start = afrp->offset_list;
    pat = pattern_list;
    line = afrp->line_list;

    while (pat != NULL && line != NULL) {
        /* everything in front of the next block start is unused */
        while (pat != NULL && line != NULL
               && (block_start == NULL || lines_seen < block_start->ival)) {
            if (pat->lengthrepeats != NULL) {
                s_ReportUnusedLine (lines_seen,
                                    lines_seen + pat->num_appearances - 1,
                                    line,
                                    afrp->report_error,
                                    afrp->report_error_userdata);
                if (block_start != afrp->offset_list) {
                    found_after_first = eTrue;
                }
            }
            lines_seen += pat->num_appearances;
            remaining = pat->num_appearances;
            while (remaining > 0 && line != NULL) {
                line = line->next;
                remaining--;
            }
            pat = pat->next;
        }

        /* step over the block itself */
        lines_in_block = 0;
        while (lines_in_block < afrp->block_size && pat != NULL) {
            lines_in_block += pat->num_appearances;
            lines_seen += pat->num_appearances;
            remaining = pat->num_appearances;
            while (remaining > 0 && line != NULL) {
                line = line->next;
                remaining--;
            }
            pat = pat->next;
        }

        if (block_start != NULL) {
            block_start = block_start->next;
        }
    }
    return found_after_first;
}
/* This function examines a list of line lengths, looking for interleaved
 * blocks.  If it finds them, it will set the SAlignRawFileData offset_list
 * member variable to point to a list of locations for the blocks and
 * block_size to the number of lines per block.
 *
 * Steps:
 *  1. collect the sizes of all groups of more than one identically shaped
 *     line that are followed by the end of the list or by a pattern without
 *     length entries, and take the most popular size (it must have been
 *     seen more than once) as the block size;
 *  2. record the start line of every group of exactly that size;
 *  3. let s_AugmentBlockPatternOffsetList add further blocks;
 *  4. if s_FindUnusedLines finds stray lines after the first block, discard
 *     the result (offset_list = NULL, block_size = 0).
 *
 * Fix: the size list is now released on the allocation-failure return as
 * well; it used to leak there.
 */
static void
s_FindInterleavedBlocks
(SLengthListPtr   pattern_list,
 SAlignRawFilePtr afrp)
{
    SLengthListPtr llp, llp_next;
    TSizeInfoPtr   size_list, best_ptr;
    TIntLinkPtr    new_offset;
    int            line_counter;

    afrp->block_size = 0;
    size_list = NULL;
    afrp->offset_list = NULL;
    for (llp = pattern_list; llp != NULL; llp = llp->next) {
        llp_next = llp->next;
        if (llp->num_appearances > 1
            && (llp_next == NULL || llp_next->lengthrepeats == NULL)) {
            size_list = s_AddSizeInfo (size_list, llp->num_appearances);
        }
    }

    /* best_ptr points into size_list, so size_list must outlive its use */
    best_ptr = s_GetMostPopularSizeInfo (size_list);
    if (best_ptr != NULL && best_ptr->num_appearances > 1) {
        afrp->block_size = best_ptr->size_value;
        line_counter = 0;
        for (llp = pattern_list; llp != NULL; llp = llp->next) {
            llp_next = llp->next;
            if (llp->num_appearances == afrp->block_size
                && (llp_next == NULL || llp_next->lengthrepeats == NULL))
            {
                new_offset = s_IntLinkNew (line_counter, afrp->offset_list);
                if (new_offset == NULL) {
                    s_SizeInfoFree (size_list);
                    return;
                }
                if (afrp->offset_list == NULL) {
                    afrp->offset_list = new_offset;
                }
            }
            line_counter += llp->num_appearances;
        }
        afrp->offset_list = s_AugmentBlockPatternOffsetList (pattern_list,
                                                           afrp->offset_list,
                                                           afrp->block_size);
    }
    if (s_FindUnusedLines (pattern_list, afrp)) {
        s_IntLinkFree (afrp->offset_list);
        afrp->offset_list = NULL;
        afrp->block_size = 0;
    }
    s_SizeInfoFree (size_list);
}
/* This function removes trailing whitespace (space, tab, carriage return,
 * newline) from linestring in place.
 *
 * The first character of the string is never removed, so a line that
 * consists only of whitespace keeps its first character.  NULL and empty
 * strings are left untouched.
 *
 * Fixes: the character tests had lost their backslashes and compared
 * against the letters 't', 'r' and 'n', which chopped real data off the end
 * of lines and left tabs/newlines in place; and for an empty string the old
 * code formed a pointer in front of the buffer.
 */
static void s_TrimEndSpace (char *linestring)
{
    size_t len;
    char   last;

    if (linestring == NULL) {
        return;
    }
    len = strlen (linestring);
    while (len > 1) {
        last = linestring [len - 1];
        if (last != ' ' && last != '\t' && last != '\r' && last != '\n') {
            break;
        }
        len--;
        linestring [len] = 0;
    }
}
/* This function reads the whole input through readfunc and builds the raw
 * description of the alignment file.
 *
 * readfunc      - called with userdata to obtain the next line; the returned
 *                 string is released with free() here.  Reading stops when
 *                 it returns NULL or a string whose first character is EOF.
 * sequence_info - supplies the alphabet (copied into the result) and is
 *                 passed to s_CheckNexusCharInfo
 * errfunc       - error callback stored in the result and used for reports
 * errdata       - opaque argument for errfunc
 *
 * For every line read before a stop line is found, the function
 *   - trims trailing whitespace and lets s_ReadOrgNamesFromText look at it,
 *   - works on a copy without leading whitespace to look for the expected
 *     sequence count/length, Nexus character information, "begin taxa;" ..
 *     "end;" sections and bracketed comments (multi-line bracketed comments
 *     are collected in a comment list),
 *   - notes lines beginning with '>' as marked IDs (offset = line + 1),
 *   - while no marked ID has been seen, adds the line's length pattern to
 *     the pattern list used for the interleaved-block search,
 *   - stores the line (without leading whitespace) in afrp->line_list.
 *
 * Afterwards the number of segments is taken from the comment list; for
 * segmented input the offsets come from GetSegmentOffsetList, and input
 * that is segmented and also has marked IDs is rejected.  If no marked IDs
 * were found, s_FindInterleavedBlocks is run on the pattern list.
 *
 * Returns the new structure, or NULL on invalid arguments, allocation
 * failure, ASN.1 input, or the segmented/marked-ID conflict.
 *
 * Fixes relative to the previous text:
 *   - the whitespace sets had lost their backslashes (" tr", " trn") and
 *     treated the letters t, r and n as separators;
 *   - a failed strdup of the working copy leaked the current line, the
 *     pattern list, the comment list and the result structure;
 *   - the first line leaked on the ASN.1 path, and a terminating line that
 *     starts with EOF was never freed.
 */
static SAlignRawFilePtr
s_ReadAlignFileRaw
(FReadLineFunction    readfunc,
 void *               userdata,
 TSequenceInfoPtr     sequence_info,
 FReportErrorFunction errfunc,
 void *               errdata)
{
    char *                   linestring;
    SAlignRawFilePtr         afrp;
    char *                   tmp;
    EBool                    found_stop;
    int                      overall_line_count;
    EBool                    found_expected_ntax = eFalse;
    EBool                    found_expected_nchar = eFalse;
    EBool                    found_char_comment = eFalse;
    SLengthListPtr           pattern_list = NULL;
    SLengthListPtr           this_pattern;
    char *                   cp;
    int                      len;
    TIntLinkPtr              new_offset;
    EBool                    in_taxa_comment;
    EBool                    in_bracketed_comment = eFalse;
    TBracketedCommentListPtr comment_list = NULL, last_comment = NULL;

    if (readfunc == NULL || sequence_info == NULL) {
        return NULL;
    }

    afrp = s_AlignFileRawNew ();
    if (afrp == NULL) {
        return NULL;
    }

    afrp->alphabet = strdup (sequence_info->alphabet);
    afrp->report_error = errfunc;
    afrp->report_error_userdata = errdata;

    overall_line_count = 0;
    found_stop = eFalse;
    in_taxa_comment = eFalse;
    linestring = readfunc (userdata);
    if (s_IsASN1 (linestring)) {
        s_ReportASN1Error (afrp->report_error, afrp->report_error_userdata);
        free (linestring);
        s_AlignFileRawFree (afrp);
        return NULL;
    }

    while (linestring != NULL && linestring [0] != EOF) {
        s_TrimEndSpace (linestring);
        s_ReadOrgNamesFromText (linestring, overall_line_count, afrp);
        /* we want to remove the comment from the line for the purpose
         * of looking for blank lines and skipping,
         * but save comments for storing in array if line is not skippable or
         * blank
         */
        len = (int) strspn (linestring, " \t\r\n");
        tmp = strdup (linestring + len);
        if (tmp == NULL) {
            free (linestring);
            s_LengthListFree (pattern_list);
            s_BracketedCommentListFree (comment_list);
            s_AlignFileRawFree (afrp);
            return NULL;
        }

        if (! found_stop && ! in_taxa_comment) {
            found_stop = s_FoundStopLine (tmp);
        }
        if (! found_stop) {
            if (! found_expected_ntax || ! found_expected_nchar) {
                if (s_IsTwoNumbersSeparatedBySpace (tmp)) {
                    s_GetFASTAExpectedNumbers (tmp, afrp);
                    found_expected_ntax = eTrue;
                    found_expected_nchar = eTrue;
                } else {
                    s_GetNexusSizeComments (tmp, &found_expected_ntax,
                                            &found_expected_nchar, afrp);
                }
            }
            if (! found_char_comment) {
                found_char_comment = s_CheckNexusCharInfo (tmp, sequence_info,
                                                 afrp->report_error,
                                                 afrp->report_error_userdata);
            }

            /* lines inside a taxa section are blanked in the working copy */
            if (in_taxa_comment) {
                if (strncmp (tmp, "end;", 4) == 0) {
                    in_taxa_comment = eFalse;
                }
                tmp [0] = 0;
            } else if (strncmp (tmp, "begin taxa;", 11) == 0) {
                tmp [0] = 0;
                in_taxa_comment = eTrue;
            }

            /* remove complete single-line bracketed comments from line
             * before checking for multiline bracketed comments */
            s_RemoveCommentFromLine (tmp);

            if (in_bracketed_comment) {
                len = (int) strspn (linestring, " \t\r\n");
                if (last_comment != NULL)
                {
                    s_BracketedCommentListAddLine (last_comment,
                                                   linestring + len,
                                                   overall_line_count, len);
                }
                if (strchr (tmp, ']') != NULL) {
                    in_bracketed_comment = eFalse;
                }
                tmp [0] = 0;
            } else if (tmp [0] == '[' && strchr (tmp, ']') == NULL) {
                in_bracketed_comment = eTrue;
                len = (int) strspn (linestring, " \t\r\n");
                last_comment = s_BracketedCommentListNew (comment_list,
                                                          linestring + len,
                                                          overall_line_count,
                                                          len);
                if (comment_list == NULL)
                {
                    comment_list = last_comment;
                }
                tmp [0] = 0;
            }

            if (s_SkippableString (tmp)) {
                tmp [0] = 0;
            }

            if (tmp [0] == '>' && ! found_stop) {
                afrp->marked_ids = eTrue;
                new_offset = s_IntLinkNew (overall_line_count + 1,
                                           afrp->offset_list);
                if (afrp->offset_list == NULL) {
                    afrp->offset_list = new_offset;
                }
            }

            if (! afrp->marked_ids) {
                /* add to length list for interleaved block search */
                len = (int) strcspn (tmp, " \t\r");
                if (len > 0) {
                    /* skip the first token and the whitespace after it;
                     * if nothing follows, the pattern of the whole line
                     * is used instead
                     */
                    cp = tmp + len;
                    len = (int) strspn (cp, " \t\r");
                    if (len > 0) {
                        cp = cp + len;
                    }
                    if (*cp == 0) {
                        this_pattern = s_GetBlockPattern (tmp);
                    } else {
                        this_pattern = s_GetBlockPattern (cp);
                    }
                    pattern_list = s_AddPatternRepeat (pattern_list,
                                                       this_pattern);
                } else {
                    this_pattern = s_GetBlockPattern (tmp);
                    pattern_list = s_AddPatternRepeat (pattern_list,
                                                       this_pattern);
                }
            }

            len = (int) strspn (linestring, " \t\r\n");
            afrp->line_list = s_AddLineInfo (afrp->line_list,
                                             linestring + len,
                                             overall_line_count, len);
        }
        free (linestring);
        free (tmp);
        linestring = readfunc (userdata);
        overall_line_count ++;
    }
    /* NULL, or a terminating line that starts with EOF */
    free (linestring);

    afrp->num_segments = s_GetNumSegmentsInAlignment (comment_list,
                                                      errfunc, errdata);
    if (afrp->num_segments > 1)
    {
        if (afrp->offset_list != NULL)
        {
            s_ReportSegmentedAlignmentError (afrp->offset_list,
                                             errfunc, errdata);
            s_AlignFileRawFree (afrp);
            s_LengthListFree (pattern_list);
            s_BracketedCommentListFree (comment_list);
            return NULL;
        }
        else
        {
            afrp->offset_list = GetSegmentOffsetList (comment_list);
            afrp->marked_ids = eTrue;
        }
    }
    if (! afrp->marked_ids) {
        s_FindInterleavedBlocks (pattern_list, afrp);
    }
    s_LengthListFree (pattern_list);
    s_BracketedCommentListFree (comment_list);
    return afrp;
}
/* This function analyzes a block to see if it contains, as the first element
 * of any of its lines, one of the sequence IDs already identified.  If
 * one of the lines does begin with a sequence ID, all of the lines are
 * assumed to begin with sequence IDs and the function returns eTrue, otherwise
 * the function returns eFalse.
 *
 * When no sequences have been identified yet (afrp->sequences == NULL) the
 * block is assumed to carry IDs and eTrue is returned.  Only lines that
 * have a first token followed by more text are considered.  eFalse is also
 * returned if the temporary ID buffer cannot be allocated.
 *
 * Fix: the token delimiter set had lost its backslashes (" tr") and cut IDs
 * at the letters 't' and 'r'; it is space, tab and carriage return.
 */
static EBool
s_DoesBlockHaveIds
(SAlignRawFilePtr afrp,
 TLineInfoPtr     first_line,
 int              num_lines_in_block)
{
    TLineInfoPtr    lip;
    char *          linestring;
    char *          this_id;
    TAlignRawSeqPtr arsp;
    size_t          len;
    int             block_offset;

    if (afrp->sequences == NULL) {
        return eTrue;
    }

    for (lip = first_line, block_offset = 0;
         lip != NULL && block_offset < num_lines_in_block;
         lip = lip->next, block_offset++)
    {
        linestring = lip->data;
        if (linestring == NULL) {
            continue;
        }
        len = strcspn (linestring, " \t\r");
        /* need a non-empty first token that is not the whole line */
        if (len == 0 || linestring [len] == 0) {
            continue;
        }
        this_id = (char *) malloc (len + 1);
        if (this_id == NULL) {
            return eFalse;
        }
        memcpy (this_id, linestring, len);
        this_id [len] = 0;
        arsp = s_FindAlignRawSeqById (afrp->sequences, this_id);
        free (this_id);
        if (arsp != NULL) {
            return eTrue;
        }
    }
    return eFalse;
}
/* This function analyzes the lines of the block to see if the pattern of
 * the lengths of the whitespace-separated pieces of sequence data matches
 * for all lines within the block.  The function returns eTrue if this is so,
 * otherwise the function returns eFalse.
 *
 * afrp               - supplies the known sequences and the error callback
 * first_line         - first line of the block
 * num_lines_in_block - number of lines to examine
 * has_ids            - eTrue if every line starts with a sequence ID, which
 *                      is skipped before the pattern is taken
 * first_block        - eTrue for the first block; ID positions are only
 *                      checked in later blocks
 *
 * When has_ids is set and this is not the first block, an ID whose position
 * in the sequence list differs from its position in the block is reported
 * with s_ReportInconsistentID.  Lines whose pattern differs from the most
 * frequent pattern in the block are reported with
 * s_ReportInconsistentBlockLine.  Either report makes the result eFalse, as
 * does a failed allocation.
 *
 * Fixes: the delimiter set had lost its backslashes (" tr") and split at
 * the letters 't' and 'r'; and the collected pattern list was leaked when
 * an ID buffer could not be allocated.
 */
static EBool
s_BlockIsConsistent
(SAlignRawFilePtr afrp,
 TLineInfoPtr     first_line,
 int              num_lines_in_block,
 EBool            has_ids,
 EBool            first_block)
{
    TLineInfoPtr   lip;
    SLengthListPtr list, this_pattern, best;
    int            len, block_offset, id_offset;
    char *         tmp_id;
    EBool          rval;
    char *         cp;

    rval = eTrue;
    list = NULL;
    for (lip = first_line, block_offset = 0;
         lip != NULL && block_offset < num_lines_in_block;
         lip = lip->next, block_offset ++)
    {
        cp = lip->data;
        if (has_ids) {
            len = (int) strcspn (cp, " \t\r");
            tmp_id = (char *) malloc ( (len + 1) * sizeof (char));
            if (tmp_id == NULL) {
                s_LengthListFree (list);
                return eFalse;
            }
            memcpy (tmp_id, cp, (size_t) len);
            tmp_id [len] = 0;
            id_offset = s_FindAlignRawSeqOffsetById (afrp->sequences, tmp_id);
            if (id_offset != block_offset && ! first_block) {
                rval = eFalse;
                s_ReportInconsistentID (tmp_id, lip->line_num,
                                        afrp->report_error,
                                        afrp->report_error_userdata);
            }
            free (tmp_id);
            cp += len;
            cp += strspn (cp, " \t\r");
        }
        this_pattern = s_GetBlockPattern (cp);
        list = s_AddLengthList (list, this_pattern);
    }

    /* Now find the pattern with the most appearances */
    best = NULL;
    for (this_pattern = list;
         this_pattern != NULL;
         this_pattern = this_pattern->next)
    {
        if (this_pattern->num_appearances == 0) continue;
        if (best == NULL
            || this_pattern->num_appearances > best->num_appearances)
        {
            best = this_pattern;
        }
    }

    /* now identify and report inconsistent lines */
    for (lip = first_line, block_offset = 0;
         lip != NULL && block_offset < num_lines_in_block;
         lip = lip->next, block_offset ++)
    {
        cp = lip->data;
        if (has_ids) {
            len = (int) strcspn (cp, " \t\r");
            tmp_id = (char *) malloc ( (len + 1) * sizeof (char));
            if (tmp_id == NULL) {
                s_LengthListFree (list);
                return eFalse;
            }
            memcpy (tmp_id, cp, (size_t) len);
            tmp_id [len] = 0;
            cp += len;
            cp += strspn (cp, " \t\r");
        } else {
            /* borrowed pointer into the sequence list - must not be freed */
            tmp_id = s_GetAlignRawSeqIDByOffset (afrp->sequences, block_offset);
        }
        this_pattern = s_GetBlockPattern (cp);
        if ( ! s_DoLengthPatternsMatch (this_pattern, best)) {
            rval = eFalse;
            s_ReportInconsistentBlockLine (tmp_id, lip->line_num,
                                           afrp->report_error,
                                           afrp->report_error_userdata);
        }
        s_LengthListFree (this_pattern);
        if (has_ids) {
            free (tmp_id);
        }
    }
    s_LengthListFree (list);
    return rval;
}
/* This function processes a block of lines and adds the sequence data from
 * each line in the block to the appropriate sequence in the list.
 *
 * afrp               - receives the sequence data in afrp->sequences
 * lines              - first line of the block
 * num_lines_in_block - number of lines to process
 * first_block        - passed on to s_BlockIsConsistent
 *
 * If s_DoesBlockHaveIds says the block carries IDs, each line is split into
 * its first token (the ID) and the rest (the data), and the data is added
 * to the sequence with that ID.  Otherwise the data of the Nth line is
 * added to the Nth sequence, and s_ReportBlockLengthError is called when
 * there is no such sequence.  Lines without data are skipped.  Processing
 * stops silently if an ID buffer cannot be allocated.
 *
 * Fixes:
 *   - the delimiter set had lost its backslashes (" tr");
 *   - the whitespace between ID and data was measured from the start of the
 *     line instead of from the end of the ID, so it was never skipped; the
 *     data pointer and the reported offset now point at the first data
 *     character, as s_BlockIsConsistent already assumes;
 *   - removed the local 'pos', which was written but never read.
 */
static void
s_ProcessBlockLines
(SAlignRawFilePtr afrp,
 TLineInfoPtr     lines,
 int              num_lines_in_block,
 EBool            first_block)
{
    TLineInfoPtr lip;
    char *       linestring;
    char *       cp;
    char *       this_id;
    size_t       len;
    int          line_number;
    EBool        this_block_has_ids;

    this_block_has_ids = s_DoesBlockHaveIds (afrp, lines, num_lines_in_block);
    /* called for its error reports; the verdict itself is not used here */
    s_BlockIsConsistent (afrp, lines, num_lines_in_block, this_block_has_ids,
                         first_block);
    for (lip = lines, line_number = 0;
         lip != NULL && line_number < num_lines_in_block;
         lip = lip->next, line_number ++)
    {
        linestring = lip->data;
        if (linestring == NULL) {
            continue;
        }
        if (this_block_has_ids) {
            len = strcspn (linestring, " \t\r");
            this_id = (char *) malloc (len + 1);
            if (this_id == NULL) {
                return;
            }
            memcpy (this_id, linestring, len);
            this_id [len] = 0;
            cp = linestring + len;
            cp += strspn (cp, " \t\r");
            afrp->sequences = s_AddAlignRawSeqById (afrp->sequences,
                                            this_id, cp,
                                            lip->line_num,
                                            lip->line_num,
                                            lip->line_offset
                                              + (int) (cp - linestring));
            free (this_id);
        } else {
            if (! s_AddAlignRawSeqByIndex (afrp->sequences, line_number,
                                           linestring,
                                           lip->line_num, lip->line_offset))
            {
                s_ReportBlockLengthError ("", lip->line_num,
                                          afrp->block_size,
                                          line_number,
                                          afrp->report_error,
                                          afrp->report_error_userdata);
            }
        }
    }
}
/* This function removes comments from the lines of an interleaved block of
 * data by calling s_RemoveCommentFromLine on each of the first
 * num_lines_in_block lines starting at first_line (fewer if the list ends
 * earlier).
 *
 * Fix: the line counter was never advanced, so the limit had no effect and
 * every line up to the end of the list was processed.
 */
static void
s_RemoveCommentsFromBlock
(TLineInfoPtr first_line,
 int          num_lines_in_block)
{
    TLineInfoPtr lip;
    int          block_offset;

    for (lip = first_line, block_offset = 0;
         lip != NULL && block_offset < num_lines_in_block;
         lip = lip->next, block_offset++)
    {
        s_RemoveCommentFromLine (lip->data);
    }
}
/* This function processes the interleaved block of data found at each
 * location listed in afrp->offset_list.
 *
 * The line list is walked once.  Whenever the running line number equals
 * the next offset, comments are removed from the afrp->block_size lines
 * that start there and the block is handed to s_ProcessBlockLines.  The
 * walk ends at the end of the lines, after the last offset, or at a stop
 * line; stop lines inside a "begin taxa;" .. "end;" section are ignored.
 *
 * Fix: inside a taxa section lip->data was passed to strncmp without the
 * NULL check that the "begin taxa;" test already had.
 */
static void s_ProcessAlignRawFileByBlockOffsets (SAlignRawFilePtr afrp)
{
    int          line_counter;
    TIntLinkPtr  offset_ptr;
    TLineInfoPtr lip;
    EBool        first_block = eTrue;
    EBool        in_taxa_comment = eFalse;

    if (afrp == NULL) {
        return;
    }

    line_counter = 0;
    offset_ptr = afrp->offset_list;
    lip = afrp->line_list;
    while (lip != NULL && offset_ptr != NULL
           && (in_taxa_comment || ! s_FoundStopLine (lip->data))) {
        if (in_taxa_comment) {
            if (lip->data != NULL
                && strncmp (lip->data, "end;", 4) == 0) {
                in_taxa_comment = eFalse;
            }
        } else if (lip->data != NULL
                   && strncmp (lip->data, "begin taxa;", 11) == 0) {
            in_taxa_comment = eTrue;
        }
        if (line_counter == offset_ptr->ival) {
            s_RemoveCommentsFromBlock (lip, afrp->block_size);
            s_ProcessBlockLines (afrp, lip, afrp->block_size, first_block);
            first_block = eFalse;
            offset_ptr = offset_ptr->next;
        }
        lip = lip->next;
        line_counter ++;
    }
}
/* The following functions are used to analyze contiguous data. */
/* Build the sequences for contiguous data from a token list, the list of
 * block offsets and one anchor pattern per segment.
 *
 * For every offset the token at line (offset - 1) is taken as the sequence
 * ID.  The tokens after it are then matched against the anchor pattern of
 * the current segment: each pattern entry asks for num_appearances tokens
 * of size_value characters.  Every token consumed is added to the sequence
 * with s_AddAlignRawSeqById; a token of a different length is reported
 * with s_ReportLineLengthError first.  Matching for an ID stops at the
 * line in front of the next offset.  If pattern entries and tokens are both
 * left over afterwards, s_ReportBlockLengthError is called.  The segment
 * index advances with every offset and wraps at afrp->num_segments.
 *
 * Nothing is done when an argument is NULL or when any segment has no
 * anchor pattern or an anchor pattern without length entries.
 */
static void
s_CreateSequencesBasedOnTokenPatterns
(TLineInfoPtr     token_list,
 TIntLinkPtr      offset_list,
 SLengthListPtr * anchorpattern,
 SAlignRawFilePtr afrp)
{
    TLineInfoPtr token;
    TIntLinkPtr  block, following_block;
    TSizeInfoPtr run;
    char *       seq_id;
    int          lines_seen;
    int          run_lines;
    int          seg;

    if (token_list == NULL || offset_list == NULL) {
        return;
    }
    if (anchorpattern == NULL || afrp == NULL) {
        return;
    }

    for (seg = 0; seg < afrp->num_segments; seg ++) {
        if (anchorpattern [seg] == NULL
            || anchorpattern [seg]->lengthrepeats == NULL) {
            return;
        }
    }

    token = token_list;
    lines_seen = 0;
    seg = 0;
    block = offset_list;
    while (block != NULL && token != NULL) {
        following_block = block->next;

        /* move to the ID line, which sits one line before the offset */
        while (lines_seen < block->ival - 1 && token != NULL) {
            token = token->next;
            lines_seen ++;
        }

        if (token != NULL) {
            seq_id = token->data;
            token = token->next;
            lines_seen ++;

            run = anchorpattern [seg]->lengthrepeats;
            while (run != NULL
                   && token != NULL
                   && (following_block == NULL
                       || lines_seen < following_block->ival - 1))
            {
                run_lines = 0;
                while (run_lines < run->num_appearances
                       && token != NULL
                       && (following_block == NULL
                           || lines_seen < following_block->ival - 1))
                {
                    if ((int) strlen (token->data) != run->size_value) {
                        s_ReportLineLengthError (seq_id, token,
                                                 run->size_value,
                                                 afrp->report_error,
                                                 afrp->report_error_userdata);
                    }
                    afrp->sequences = s_AddAlignRawSeqById (afrp->sequences,
                                                            seq_id,
                                                            token->data,
                                                            token->line_num,
                                                            token->line_num,
                                                            token->line_offset);
                    token = token->next;
                    lines_seen ++;
                    run_lines ++;
                }
                run = run->next;
            }

            if (run != NULL && token != NULL) {
                s_ReportBlockLengthError (seq_id, token->line_num,
                                          afrp->block_size,
                                          lines_seen - block->ival,
                                          afrp->report_error,
                                          afrp->report_error_userdata);
            }
        }

        seg ++;
        if (seg >= afrp->num_segments) {
            seg = 0;
        }
        block = block->next;
    }
}
/* The following functions are used for analyzing contiguous data with
* marked IDs.
*/
/* This function creates a new LengthList pattern for each marked ID.
 * After each new list is created, it is handed to s_AddLengthList for the
 * current segment (which, per the original description, either merges it
 * into a matching pattern and increments num_appearances or appends it).
 * When the lists are complete, the function finds, for every segment, the
 * pattern with the most appearances and returns those patterns as the
 * anchor patterns to use when checking sequence data blocks for consistency
 * with one another.
 *
 * Lines are read from afrp->line_list up to a stop line.  Lines without
 * data and lines starting with '[' or ']' are ignored; a line starting
 * with '>' starts a new pattern; any other line contributes its length,
 * not counting leading whitespace and digits, to the current pattern.
 *
 * Returns an array of afrp->num_segments + 1 pattern pointers (the last one
 * is a NULL terminator), each anchor detached from the other patterns, or
 * NULL if afrp is NULL, an allocation fails, or some segment has no
 * pattern.  The array and the patterns it points to belong to the caller.
 *
 * Fixes:
 *   - the temporary per-segment array 'list' was leaked on every path
 *     except one, and 'best' was leaked on the failure paths;
 *   - characters are converted to unsigned char before being passed to
 *     isspace/isdigit.
 */
static SLengthListPtr *
s_CreateAnchorPatternForMarkedIDs
(SAlignRawFilePtr afrp)
{
    SLengthListPtr * list;
    SLengthListPtr * best;
    SLengthListPtr   this_pattern;
    char *           cp;
    TLineInfoPtr     lip;
    int              curr_seg;
    int              missing;

    if (afrp == NULL) {
        return NULL;
    }

    /* initialize length lists */
    list = (SLengthListPtr *) malloc (afrp->num_segments
                                      * sizeof (SLengthListPtr));
    if (list == NULL)
    {
        return NULL;
    }
    for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
    {
        list [curr_seg] = NULL;
    }

    /* initialize best ptrs */
    /* list is one element longer, to hold null terminator */
    best = (SLengthListPtr *) malloc ((afrp->num_segments + 1)
                                      * sizeof (SLengthListPtr));
    if (best == NULL)
    {
        free (list);
        return NULL;
    }
    for (curr_seg = 0; curr_seg < afrp->num_segments + 1; curr_seg ++)
    {
        best [curr_seg] = NULL;
    }

    /* initialize pattern */
    this_pattern = NULL;

    curr_seg = 0;
    for (lip = afrp->line_list;
         lip != NULL && ! s_FoundStopLine (lip->data);
         lip = lip->next)
    {
        if (lip->data == NULL) continue;
        if (lip->data [0] == ']' || lip->data [0] == '[') continue;
        if (lip->data [0] == '>') {
            /* a new ID closes the pattern of the previous one */
            if (this_pattern != NULL) {
                list [curr_seg] = s_AddLengthList (list [curr_seg],
                                                   this_pattern);
                curr_seg ++;
                if (curr_seg >= afrp->num_segments)
                {
                    curr_seg = 0;
                }
            }
            this_pattern = s_LengthListNew (NULL);
            if (this_pattern == NULL) {
                for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
                {
                    s_LengthListFree (list [curr_seg]);
                }
                free (list);
                free (best);
                return NULL;
            }
            this_pattern->num_appearances = 1;
        } else if (this_pattern != NULL) {
            /* This section gets rid of base pair number comments */
            cp = lip->data;
            while (isspace ((unsigned char) *cp)
                   || isdigit ((unsigned char) *cp)) {
                cp++;
            }
            s_AddLengthRepeat (this_pattern, strlen (cp));
        }
    }
    if (this_pattern != NULL) {
        list [curr_seg] = s_AddLengthList (list [curr_seg], this_pattern);
    }

    /* Now find the pattern with the most appearances for each segment */
    for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
    {
        for (this_pattern = list [curr_seg];
             this_pattern != NULL;
             this_pattern = this_pattern->next)
        {
            if (this_pattern->num_appearances == 0) continue;
            if (best [curr_seg] == NULL
                || this_pattern->num_appearances
                     > best [curr_seg]->num_appearances)
            {
                best [curr_seg] = this_pattern;
            }
        }

        /* free all patterns before and after anchor pattern */
        if (best [curr_seg] != NULL) {
            s_LengthListFree (best [curr_seg]->next);
            best [curr_seg]->next = NULL;
        }

        if (best [curr_seg] != list [curr_seg]) {
            this_pattern = list [curr_seg];
            while (this_pattern != NULL
                   && this_pattern->next != best [curr_seg]) {
                this_pattern = this_pattern->next;
            }
            if (this_pattern != NULL) {
                this_pattern->next = NULL;
                s_LengthListFree (list [curr_seg]);
            }
        }
    }

    /* every chain that started in list[] has now either been freed or is
     * reachable through best[], so only the array itself is left
     */
    free (list);

    missing = 0;
    for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
    {
        if (best [curr_seg] == NULL)
        {
            missing = 1;
        }
    }
    if (missing)
    {
        for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++)
        {
            s_LengthListFree (best [curr_seg]);
        }
        free (best);
        return NULL;
    }

    return best;
}
/* This function removes base pair count comments from the data sections
 * for contiguous marked ID sequences: for every line of a data section,
 * leading whitespace and digits are stripped from the stored text in place.
 *
 * A data section starts at the line whose number equals an entry of
 * afrp->offset_list and runs up to, but not including, the line before the
 * next entry (or to the end of the lines for the last entry).
 *
 * Fixes:
 *   - the character set had lost its backslashes (" trn1234567890") and
 *     stripped the letters t, r and n instead of tab, CR and LF;
 *   - the text was moved with strcpy although source and destination
 *     overlap; memmove is used instead.
 */
static void s_RemoveBasePairCountCommentsFromData (SAlignRawFilePtr afrp)
{
    TIntLinkPtr  this_offset, next_offset;
    TLineInfoPtr lip;
    int          line_count;
    char *       cp;

    if (afrp == NULL || afrp->offset_list == NULL) {
        return;
    }
    this_offset = afrp->offset_list;
    next_offset = this_offset->next;
    lip = afrp->line_list;
    line_count = 0;
    while (lip != NULL && this_offset != NULL) {
        if (line_count == this_offset->ival) {
            while (lip != NULL
                   && (next_offset == NULL
                       || line_count < next_offset->ival - 1)) {
                cp = lip->data;
                if (cp != NULL) {
                    cp += strspn (cp, " \t\r\n1234567890");
                    if (cp != lip->data) {
                        /* include the terminating zero */
                        memmove (lip->data, cp, strlen (cp) + 1);
                    }
                }
                line_count ++;
                lip = lip->next;
            }
            this_offset = this_offset->next;
            if (this_offset != NULL) {
                next_offset = this_offset->next;
            }
        } else {
            line_count ++;
            lip = lip->next;
        }
    }
}
/* This function assumes that the offset_list has already been populated
 * with the locations of the data blocks.  It strips base pair count
 * comments from the data, analyzes the blocks of data to find the most
 * frequently occurring pattern of lengths of data and then uses that
 * pattern to attach the data to the correct IDs and report any
 * errors in formatting.
 *
 * Fix: the anchor pattern array returned by
 * s_CreateAnchorPatternForMarkedIDs, and the patterns it holds, were never
 * released.  s_CreateSequencesBasedOnTokenPatterns only reads them, so they
 * are freed here once it has returned (and also when the offset list turns
 * out to be empty).
 */
static void s_ProcessAlignFileRawForMarkedIDs (SAlignRawFilePtr afrp)
{
    SLengthListPtr * anchorpattern;
    int              curr_seg;

    if (afrp == NULL) {
        return;
    }

    s_RemoveBasePairCountCommentsFromData (afrp);
    anchorpattern = s_CreateAnchorPatternForMarkedIDs (afrp);
    if (anchorpattern == NULL) {
        return;
    }
    if (afrp->offset_list != NULL) {
        s_CreateSequencesBasedOnTokenPatterns (afrp->line_list,
                                               afrp->offset_list,
                                               anchorpattern, afrp);
    }

    for (curr_seg = 0; curr_seg < afrp->num_segments; curr_seg ++) {
        s_LengthListFree (anchorpattern [curr_seg]);
    }
    free (anchorpattern);
}
/* The following functions are used for analyzing contiguous sequence data
* without marked IDs.
*/
/* Copy the zero-terminated string at cp_to onto cp_from one character at a
 * time, terminator included.  Intended for moving the tail of a string
 * towards its start within the same buffer.
 *
 * Nothing happens when either pointer is NULL or when both are the same.
 */
static void
s_StringLeftShift
(char * cp_from,
 char * cp_to)
{
    char moved;

    if (cp_from == NULL || cp_to == NULL || cp_from == cp_to) {
        return;
    }
    do {
        moved = *cp_to++;
        *cp_from++ = moved;
    } while (moved != 0);
}
/* This function removes bracketed comments from a linked list of
 * SLineInfo structures. The function returns a pointer to the
 * list without the comments.
 *
 * Each entry's text is edited in place.  Entries that have no text, that
 * end up empty, or that lie inside a comment spanning several entries are
 * flagged with delete_me; the list is then passed to s_DeleteLineInfos and
 * its return value is returned (presumably the list with the flagged
 * entries removed - confirm against s_DeleteLineInfos).
 *
 * num_comment_starts counts '[' characters that have not yet been matched
 * by a ']' and is carried over from one entry to the next; in_comment
 * records that a comment opened in an earlier entry is still open.
 */
static TLineInfoPtr s_RemoveCommentsFromTokens (TLineInfoPtr list)
{
TLineInfoPtr lip;
int num_comment_starts;
char * cp_l;
char * cp_r;
char * cp;
EBool in_comment;
num_comment_starts = 0;
in_comment = eFalse;
for (lip = list; lip != NULL; lip = lip->next) {
if (lip->data == NULL) {
lip->delete_me = eTrue;
} else {
/* cp_l is set here but never read in this function */
cp_l = NULL;
/* cp_r: position of the most recent unmatched '[' in this entry */
cp_r = NULL;
for (cp = lip->data; *cp != 0; cp++) {
if (*cp == ']') {
if (cp_r == NULL) {
/* no '[' seen in this entry: drop everything up to and including the
 * ']' and restart the scan at the beginning of the text
 */
s_StringLeftShift (lip->data, cp + 1);
/* NOTE(review): forms a pointer one before the buffer; the loop's cp++
 * brings it back to lip->data before it is dereferenced
 */
cp = lip->data - 1;
} else {
/* cut out the span from the '[' through this ']' */
s_StringLeftShift (cp_r, cp + 1);
/* NOTE(review): the loop's cp++ then moves past the character that now
 * sits at cp_r, so that character is not examined - confirm intended
 */
cp = cp_r;
if (cp_r > lip->data) {
/* search backwards for an earlier '[' that is still open */
cp_r --;
while (cp_r >= lip->data && *cp_r != '[') {
cp_r --;
}
if (cp_r < lip->data) {
cp_r = NULL;
}
} else {
cp_r = NULL;
}
}
if (num_comment_starts > 0) {
num_comment_starts --;
}
} else if (*cp == '[') {
cp_r = cp;
num_comment_starts ++;
}
}
if (in_comment) {
if (num_comment_starts == 0) {
/* the comment carried over from an earlier entry was closed here */
in_comment = eFalse;
} else {
/* still inside the carried-over comment: drop the whole entry */
lip->delete_me = eTrue;
}
} else if (num_comment_starts > 0) {
/* a comment opens in this entry and is not closed: keep only the text
 * in front of the first '['
 */
cp_r = strchr (lip->data, '[');
if (cp_r != NULL) {
*cp_r = 0;
}
in_comment = eTrue;
}
/* nothing left of this entry */
if (lip->data [0] == 0) {
lip->delete_me = eTrue;
}
}
}
list = s_DeleteLineInfos (list);
return list;
}
/* Flag the Nexus header tokens of a token list for deletion and return the
 * result of s_DeleteLineInfos on the list.
 *
 * Every token that compares equal to "#NEXUS" with s_StringICmp starts a
 * header.  If a later token compares equal to "matrix", all tokens from
 * the "#NEXUS" token through the "matrix" token are flagged with
 * delete_me and scanning resumes after "matrix".  If no "matrix" token
 * follows, nothing is flagged for that header and scanning simply moves on
 * to the next token.
 */
static TLineInfoPtr s_RemoveNexusCommentsFromTokens (TLineInfoPtr list)
{
    TLineInfoPtr token, header_end, doomed;

    token = list;
    while (token != NULL) {
        if (s_StringICmp (token->data, "#NEXUS") != 0) {
            token = token->next;
            continue;
        }

        /* find the token that ends the header */
        header_end = token;
        while (header_end != NULL
               && s_StringICmp (header_end->data, "matrix") != 0) {
            header_end = header_end->next;
        }
        if (header_end == NULL) {
            token = token->next;
            continue;
        }

        for (doomed = token; doomed != header_end; doomed = doomed->next) {
            doomed->delete_me = eTrue;
        }
        header_end->delete_me = eTrue;
        token = header_end->next;
    }
    return s_DeleteLineInfos (list);
}
/* Determine which token length is the most popular in 'list', leaving out
 * entries whose size_value equals not_this_size.
 *
 * The remaining entries are accumulated into a temporary list with
 * s_AddSizeInfoAppearances and the winner is picked with
 * s_GetMostPopularSizeInfo.  The result is a newly created SSizeInfo that
 * carries the winner's size_value and num_appearances; the caller owns it.
 * NULL is returned when there is no winner or the result cannot be
 * allocated.
 */
static TSizeInfoPtr
s_FindMostFrequentlyOccurringTokenLength
(TSizeInfoPtr list,
 int          not_this_size)
{
    TSizeInfoPtr candidates = NULL;
    TSizeInfoPtr entry;
    TSizeInfoPtr winner;
    TSizeInfoPtr result = NULL;

    for (entry = list; entry != NULL; entry = entry->next) {
        if (entry->size_value == not_this_size) {
            continue;
        }
        candidates = s_AddSizeInfoAppearances (candidates,
                                               entry->size_value,
                                               entry->num_appearances);
    }

    winner = s_GetMostPopularSizeInfo (candidates);
    if (winner != NULL) {
        result = s_SizeInfoNew (NULL);
        if (result != NULL) {
            result->size_value = winner->size_value;
            result->num_appearances = winner->num_appearances;
        }
    }

    /* the winner lives in the temporary list, so free that last */
    s_SizeInfoFree (candidates);
    return result;
}
/* Decide whether the line that usually follows the anchor pattern belongs
 * to the pattern and, if so, append its length to the anchor pattern.
 *
 * Contiguous data often consists of several lines of one length followed
 * by one shorter line with the rest of the sequence.  For every entry of
 * line_lengths that equals the first entry of the anchor pattern (per
 * s_SizeInfoIsEqual), the entry after it is a candidate "last line" when
 *   - it exists and its length is greater than zero,
 *   - its length differs from the anchor line length, and
 *   - the entry after the candidate is absent or also differs from the
 *     anchor line length (otherwise the candidate is the ID of the next
 *     sequence and must not become part of the anchor).
 * The most popular candidate length, if greater than zero, is added to the
 * anchor pattern with s_AddLengthRepeat.
 *
 * Nothing is done when an argument is NULL or the anchor pattern has no
 * length entries.
 */
static void
s_ExtendAnchorPattern
(SLengthListPtr anchorpattern,
 TSizeInfoPtr   line_lengths)
{
    TSizeInfoPtr candidates = NULL;
    TSizeInfoPtr entry, following, after_following;
    int          anchor_len;
    int          popular_len;

    if (anchorpattern == NULL || line_lengths == NULL) {
        return;
    }
    if (anchorpattern->lengthrepeats == NULL) {
        return;
    }

    anchor_len = anchorpattern->lengthrepeats->size_value;

    for (entry = line_lengths; entry != NULL; entry = entry->next) {
        if (! s_SizeInfoIsEqual (entry, anchorpattern->lengthrepeats)) {
            continue;
        }
        following = entry->next;
        if (following == NULL
            || following->size_value <= 0
            || following->size_value == anchor_len) {
            continue;
        }
        after_following = following->next;
        if (after_following != NULL
            && after_following->size_value == anchor_len) {
            /* 'following' is the ID of the next sequence */
            continue;
        }
        candidates = s_AddSizeInfo (candidates, following->size_value);
    }

    popular_len = s_GetMostPopularSize (candidates);
    if (popular_len > 0) {
        s_AddLengthRepeat (anchorpattern, popular_len);
    }
    s_SizeInfoFree (candidates);
}
/* This function looks for the most frequently occurring pattern, where a
 * pattern is considered to be N contiguous tokens of M characters. The
 * function then checks to see if there is usually a token of a particular
 * length that immediately follows this pattern that is not the ID for the
 * next sequence. If so, this line length is added to the pattern.
 *
 * list is the run-length encoded list of token lengths.
 * The function returns a pointer to the chosen pattern (a single
 * SLengthList node that the caller releases with s_LengthListFree), or NULL
 * if no pattern could be built or an allocation failed.
 */
static SLengthListPtr s_FindMostPopularPattern (TSizeInfoPtr list)
{
    SLengthListPtr patternlist, newpattern;
    TSizeInfoPtr   sip, popular_line_length, replacement;
    SLengthListPtr index, best;
    int            not_this_length;

    /* build one candidate pattern per non-empty run of equal-length tokens */
    patternlist = NULL;
    for (sip = list; sip != NULL; sip = sip->next) {
        if (sip->size_value > 0) {
            newpattern = s_LengthListNew (NULL);
            if (newpattern == NULL) {
                s_LengthListFree (patternlist);
                return NULL;
            }
            newpattern->num_appearances = 1;
            newpattern->lengthrepeats = s_SizeInfoNew (NULL);
            if (newpattern->lengthrepeats == NULL) {
                /* newpattern is not linked into patternlist yet, so it has
                 * to be released on its own
                 */
                s_LengthListFree (newpattern);
                s_LengthListFree (patternlist);
                return NULL;
            }
            newpattern->lengthrepeats->size_value = sip->size_value;
            newpattern->lengthrepeats->num_appearances = sip->num_appearances;
            patternlist = s_AddLengthList (patternlist, newpattern);
        }
    }
    if (patternlist == NULL) {
        return NULL;
    }

    /* pick the pattern seen most often; ties go to the longer token length.
     * Patterns that consist of a single line are not considered.
     */
    best = NULL;
    for (index = patternlist; index != NULL; index = index->next) {
        if (index->lengthrepeats->num_appearances < 2) {
            continue;
        }
        if (best==NULL || best->num_appearances < index->num_appearances) {
            best = index;
        } else if (best->num_appearances == index->num_appearances
            && best->lengthrepeats->size_value <
                                  index->lengthrepeats->size_value) {
            best = index;
        }
    }

    /* Free data in list before best pattern. When best is NULL the walk
     * stops at the last node, so the entire list is released.
     */
    index = patternlist;
    while ( index != NULL && index->next != best ) {
         index = index->next;
    }
    if (index != NULL) {
        index->next = NULL;
        s_LengthListFree (patternlist);
    }
    /* Free data in list after best pattern */
    if (best != NULL) {
        s_LengthListFree (best->next);
        best->next = NULL;
    }

    /* most common single token length; if it is the same as the one in the
     * best pattern, look for the runner-up instead
     */
    popular_line_length = s_FindMostFrequentlyOccurringTokenLength (list, 0);

    if (best != NULL && best->lengthrepeats != NULL
      && popular_line_length != NULL
      && best->lengthrepeats->size_value == popular_line_length->size_value)
    {
        not_this_length = popular_line_length->size_value;
        s_SizeInfoFree (popular_line_length);
        popular_line_length = s_FindMostFrequentlyOccurringTokenLength (list,
                                                        not_this_length);
    }

    if (best == NULL
        || (popular_line_length != NULL
          && popular_line_length->size_value > best->lengthrepeats->size_value
          && popular_line_length->num_appearances > best->num_appearances))
    {
        if (popular_line_length == NULL) {
            /* only reachable with best == NULL: there is neither a pattern
             * nor a popular length to build one from
             */
            return NULL;
        }
        replacement = s_SizeInfoNew (NULL);
        if (replacement == NULL) {
            s_LengthListFree (best);
            s_SizeInfoFree (popular_line_length);
            return NULL;
        }
        replacement->size_value = popular_line_length->size_value;
        replacement->num_appearances = 1;

        if (best == NULL) {
            best = s_LengthListNew (NULL);
            if (best == NULL) {
                s_SizeInfoFree (replacement);
                s_SizeInfoFree (popular_line_length);
                return NULL;
            }
        } else {
            /* release the description that is about to be replaced */
            s_SizeInfoFree (best->lengthrepeats);
        }
        best->lengthrepeats = replacement;
    } else {
        /* extend anchor pattern to include best length of last line */
        s_ExtendAnchorPattern (best, list);
    }

    s_SizeInfoFree (popular_line_length);

    return best;
}
/* Builds an SIntLink list holding the line number at which every run in
 * list that s_SizeInfoIsEqual reports equal to the anchor pattern begins.
 * Returns the head of that list; NULL when either argument is NULL, when
 * nothing matches, or when a link cannot be created (links made so far are
 * released in that case).
 */
static TIntLinkPtr
s_CreateOffsetList
(TSizeInfoPtr   list,
 SLengthListPtr anchorpattern)
{
    TIntLinkPtr  head = NULL;
    TIntLinkPtr  added;
    TSizeInfoPtr run;
    int          first_line = 0;

    if (list == NULL || anchorpattern == NULL) {
        return NULL;
    }

    run = list;
    while (run != NULL) {
        if (s_SizeInfoIsEqual (run, anchorpattern->lengthrepeats)) {
            added = s_IntLinkNew (first_line, head);
            if (added == NULL) {
                s_IntLinkFree (head);
                return NULL;
            }
            if (head == NULL) {
                head = added;
            }
        }
        /* each run stands for num_appearances consecutive lines */
        first_line += run->num_appearances;
        run = run->next;
    }
    return head;
}
/* Tries to assemble exactly pattern_length characters out of the tokens
 * described by list, which is a run of equal-length lines whose first line
 * has number line_start. Every possible number of leading lines to skip,
 * from sip_offset upwards, is tried in turn. Lines are accumulated from the
 * rest of this run and from the runs that follow, without going past the
 * line recorded in next_offset (when next_offset is not NULL).
 * Returns the line number at which the forecast block starts, or -1 when no
 * starting point yields exactly pattern_length characters.
 */
static int
s_ForecastPattern
(int          line_start,
 int          pattern_length,
 TIntLinkPtr  next_offset,
 int          sip_offset,
 TSizeInfoPtr list)
{
    TSizeInfoPtr run;
    int          skip, used;
    int          current_line, chars_seen;

    if (list == NULL) {
        return -1;
    }

    for (skip = sip_offset; skip < list->num_appearances; skip++) {
        current_line = line_start + skip;
        /* whatever is left of the first run after skipping */
        chars_seen = list->size_value * (list->num_appearances - skip);
        run = list;

        while (chars_seen < pattern_length) {
            if (next_offset != NULL && current_line >= next_offset->ival) {
                break;
            }
            if (run->next == NULL) {
                break;
            }
            run = run->next;
            used = 0;
            while (used < run->num_appearances
                   && chars_seen < pattern_length
                   && (next_offset == NULL
                       || current_line < next_offset->ival)) {
                chars_seen += run->size_value;
                current_line ++;
                used ++;
            }
        }

        if (chars_seen == pattern_length) {
            return line_start + skip;
        }
    }
    return -1;
}
/* This function examines the offset list and searches for holes where blocks
 * of sequence data without the exact expected formatting might exist. The
 * function adds the offsets of any new blocks to the list and returns a
 * pointer to the augmented offset list.
 *
 * offset_list   - line numbers of the blocks already located (may be NULL)
 * list          - run-length encoded token lengths for the whole file
 * anchorpattern - the pattern whose total character count a new block has
 *                 to match
 *
 * If list or anchorpattern is NULL, or the pattern describes zero
 * characters, offset_list is returned unchanged. If a new link cannot be
 * allocated the whole offset list is released and NULL is returned; the
 * caller replaces its own pointer with the return value, so the list would
 * otherwise be lost.
 */
static TIntLinkPtr
s_AugmentOffsetList
(TIntLinkPtr    offset_list,
 TSizeInfoPtr   list,
 SLengthListPtr anchorpattern)
{
    int           pattern_length;
    TSizeInfoPtr  sip;
    TIntLinkPtr   prev_offset, next_offset, new_offset;
    int           line_counter, forecast_position, line_skip;
    EBool         skipped_previous = eFalse;
    int           num_chars;

    if (list == NULL || anchorpattern == NULL) {
        return offset_list;
    }

    /* total number of characters described by the anchor pattern */
    pattern_length = 0;
    for (sip = anchorpattern->lengthrepeats; sip != NULL; sip = sip->next) {
        pattern_length += (sip->size_value * sip->num_appearances);
    }
    if (pattern_length == 0) {
        return offset_list;
    }

    prev_offset = NULL;
    next_offset = offset_list;
    line_counter = 0;
    sip = list;
    while (sip != NULL) {
        /* if we are somehow out of synch, don't get caught in infinite loop */
        if (next_offset != NULL && line_counter > next_offset->ival) {
            next_offset = next_offset->next;
        } else if (next_offset != NULL && line_counter == next_offset->ival) {
            /* standing on a known block */
            skipped_previous = eFalse;
            prev_offset = next_offset;
            next_offset = next_offset->next;
            /* advance sip and line counter past the end of this pattern */
            num_chars = 0;
            while (num_chars < pattern_length && sip != NULL) {
                num_chars += sip->size_value * sip->num_appearances;
                line_counter += sip->num_appearances;
                sip = sip->next;
            }
        } else if (skipped_previous) {
            /* at least one run has been skipped since the last known block,
             * so there is room for an ID before a forecast block
             */
            line_skip = 0;
            while (sip != NULL && line_skip < sip->num_appearances
                  && (next_offset == NULL
                      || line_counter < next_offset->ival)) {
                /* see if we can build a pattern that matches the pattern
                 * length we want
                 */
                forecast_position = s_ForecastPattern (line_counter,
                                                       pattern_length,
                                                       next_offset, line_skip,
                                                       sip);
                if (forecast_position > 0) {
                    new_offset = s_IntLinkNew (forecast_position, NULL);
                    if (new_offset == NULL) {
                        /* do not leak the list the caller handed over */
                        s_IntLinkFree (offset_list);
                        return NULL;
                    }
                    /* splice the new link in between prev and next */
                    if (prev_offset == NULL) {
                        new_offset->next = offset_list;
                        offset_list = new_offset;
                    } else {
                        new_offset->next = next_offset;
                        prev_offset->next = new_offset;
                    }
                    prev_offset = new_offset;
                    /* now advance sip and line counter past the end
                     * of the pattern we have just created
                     */
                    num_chars = 0;
                    while (num_chars < pattern_length && sip != NULL) {
                        for (line_skip = 0;
                             line_skip < sip->num_appearances
                                 && num_chars < pattern_length;
                             line_skip++)
                        {
                            num_chars += sip->size_value;
                            line_counter ++;
                        }
                        if (line_skip == sip->num_appearances) {
                            sip = sip->next;
                            line_skip = 0;
                        }
                    }
                } else {
                    line_counter += sip->num_appearances;
                    sip = sip->next;
                    line_skip = 0;
                }
            }
        } else {
            /* first run after a known block: skip it and remember that */
            skipped_previous = eTrue;
            line_counter += sip->num_appearances;
            sip = sip->next;
        }
    }
    return offset_list;
}
/* Tallies the distance, in lines, between each pair of neighbouring entries
 * of offset_list and returns whichever distance s_GetMostPopularSize picks
 * from that tally. Returns -1 when offset_list is NULL.
 */
static int s_GetMostPopularPatternLength (TIntLinkPtr offset_list)
{
    TSizeInfoPtr gaps = NULL;
    TIntLinkPtr  node;
    int          previous_start = -1;   /* -1: no earlier block seen yet */
    int          winner;

    if (offset_list == NULL) {
        return -1;
    }

    node = offset_list;
    while (node != NULL) {
        if (previous_start != -1) {
            gaps = s_AddSizeInfo (gaps, node->ival - previous_start);
        }
        previous_start = node->ival;
        node = node->next;
    }

    winner = s_GetMostPopularSize (gaps);
    s_SizeInfoFree (gaps);
    return winner;
}
/* Works out how many characters a well-formed block of sequence data
 * usually holds. For each pair of neighbouring offsets the characters of all
 * tokens between them, except the last token before the second offset, are
 * added up; only pairs exactly block_length lines apart contribute to the
 * tally. The most popular total is returned. If s_GetMostPopularSize
 * reports 0 although totals were recorded, the first recorded total is
 * returned instead. Returns -1 for NULL lists or block_length < 1.
 */
static int
s_GetBestCharacterLength
(TLineInfoPtr token_list,
 TIntLinkPtr  offset_list,
 int          block_length)
{
    TSizeInfoPtr char_counts = NULL;
    TLineInfoPtr token;
    TIntLinkPtr  earlier, current;
    int          span, step, chars_in_block, winner;

    if (block_length < 1 || offset_list == NULL || token_list == NULL) {
        return -1;
    }

    token = token_list;
    earlier = NULL;
    current = offset_list;
    while (current != NULL && token != NULL) {
        if (earlier == NULL) {
            /* move the token cursor up to the first known block */
            step = 0;
            while (step < current->ival && token != NULL) {
                token = token->next;
                step ++;
            }
        } else {
            span = current->ival - earlier->ival;
            chars_in_block = 0;
            for (step = 0; step < span && token != NULL; step ++) {
                /* the final token of the span is left out of the count */
                if (step < span - 1) {
                    chars_in_block += strlen (token->data);
                }
                token = token->next;
            }
            if (span == block_length) {
                char_counts = s_AddSizeInfo (char_counts, chars_in_block);
            }
        }
        earlier = current;
        current = current->next;
    }

    winner = s_GetMostPopularSize (char_counts);
    if (winner == 0 && char_counts != NULL) {
        winner = char_counts->size_value;
    }
    s_SizeInfoFree (char_counts);
    char_counts = NULL;
    return winner;
}
/* This function inspects the tokens between two offsets, looking for places
 * where a block of exactly desired_num_chars characters could start.
 *
 * list              - first token of the known block at the earlier offset
 * distance          - number of lines between the two offsets
 * desired_num_chars - number of characters a block is expected to hold
 *
 * The candidate start positions are collected and then discarded again: the
 * step that would choose among them is not implemented, so every path
 * through this function returns 0.
 */
static int
s_CountCharactersBetweenOffsets
(TLineInfoPtr list,
 int          distance,
 int          desired_num_chars)
{
    int          line_diff, num_chars, total_chars, pattern_length;
    TLineInfoPtr lip;
    TIntLinkPtr  length_list, start_list, start_ptr, length;
    int          start_of_unknown;
    int          num_additional_offsets_needed;

    if (list == NULL || distance == 0 || desired_num_chars == 0) {
        return 0;
    }

    /* because the first offset is the start of a known pattern, we should
     * skip to the end of that pattern and start looking for additional
     * offsets
     */
    total_chars = 0;
    for (lip = list, line_diff = 0;
         lip != NULL && line_diff < distance
             && total_chars < desired_num_chars;
         lip = lip->next, line_diff++) {
        num_chars = strlen (lip->data);
        total_chars += num_chars;
    }
    while (lip != NULL && line_diff < distance && s_IsBlank (lip->data)) {
        lip = lip->next;
        line_diff ++;
    }
    /* skip over line we would need for ID */
    if (lip != NULL) {
        lip = lip->next;
        line_diff ++;
    }
    if (lip == NULL || line_diff == distance) {
        return 0;
    }

    /* record the length of every remaining token up to the next offset */
    list = lip->next;
    start_of_unknown = line_diff;
    length_list = NULL;
    total_chars = 0;
    for (lip = list;
         lip != NULL && line_diff < distance;
         lip = lip->next, line_diff++)
    {
        num_chars = strlen (lip->data);
        length = s_IntLinkNew (num_chars, length_list);
        if (length_list == NULL) {
            length_list = length;
        }
        total_chars += num_chars;
    }

    /* how many offsets do we need? */
    num_additional_offsets_needed = (total_chars / desired_num_chars);
    if (num_additional_offsets_needed == 0) {
        /* the recorded lengths are no longer needed */
        s_IntLinkFree (length_list);
        return 0;
    }

    /* Find all the places you could start and get the exact right number
     * of characters
     */
    start_list = NULL;
    pattern_length = 0;
    for (start_ptr = length_list, line_diff = start_of_unknown;
         start_ptr != NULL && line_diff < distance
             && pattern_length < distance - line_diff ;
         start_ptr = start_ptr->next, line_diff++) {
        num_chars = start_ptr->ival;
        pattern_length = 1;
        length = start_ptr->next;
        while (num_chars < desired_num_chars
               && pattern_length + line_diff < distance
               && length != NULL)
        {
            num_chars += length->ival;
            pattern_length ++;
            length = length->next;
        }
        if (num_chars == desired_num_chars) {
            length = s_IntLinkNew (line_diff, start_list);
            if (start_list == NULL) {
                start_list = length;
            }
        }
    }

    /* now select best set of start points (not implemented) */

    s_IntLinkFree (length_list);
    s_IntLinkFree (start_list);
    return 0;
}
/* This function inserts new block locations into the offset_list
 * by looking for likely starts of abnormal patterns.
 *
 * token_list     - tokens of the file, walked in step with the offsets
 * offset_list    - line numbers of the blocks found so far; new links are
 *                  spliced into this list in place
 * block_length   - usual distance in lines between neighbouring offsets
 * best_num_chars - usual number of characters in one block
 * alphabet       - not referenced in the body
 *
 * Nothing is done when either list is NULL or when block_length or
 * best_num_chars is smaller than 1.
 */
static void s_InsertNewOffsets
(TLineInfoPtr token_list,
TIntLinkPtr offset_list,
int block_length,
int best_num_chars,
char * alphabet)
{
TLineInfoPtr lip, prev_start;
TIntLinkPtr prev_offset, new_offset, splice_offset;
int line_diff, num_chars, line_start;
if (token_list == NULL || offset_list == NULL
|| block_length < 1 || best_num_chars < 1)
{
return;
}
/* lip is the token cursor; it is kept level with the offset being visited */
lip = token_list;
prev_offset = NULL;
for (new_offset = offset_list;
new_offset != NULL && lip != NULL;
new_offset = new_offset->next) {
if (prev_offset == NULL) {
/* first offset: just advance through the tokens that precede it */
for (line_diff = 0;
line_diff < new_offset->ival && lip != NULL;
line_diff ++) {
lip = lip->next;
}
} else {
if (new_offset->ival - prev_offset->ival == block_length) {
/* gap has the usual size: just advance through tokens */
for (line_diff = 0;
line_diff < new_offset->ival - prev_offset->ival
&& lip != NULL;
line_diff ++) {
lip = lip->next;
}
} else {
/* unusual gap: look for intermediate breaks */
/* prev_start is assigned here but not read again in this function */
prev_start = lip;
num_chars = 0;
/* consume tokens until one block's worth of characters has been seen */
for (line_diff = 0;
line_diff < new_offset->ival - prev_offset->ival
&& lip != NULL && num_chars < best_num_chars;
line_diff ++) {
num_chars += strlen (lip->data);
lip = lip->next;
}
if (lip == NULL) {
return;
}
/* set new offset at first line of next pattern, one token further on */
line_diff ++;
lip = lip->next;
if (line_diff < new_offset->ival - prev_offset->ival) {
line_start = line_diff + prev_offset->ival;
/* advance token pointer to new piece */
while (line_diff < new_offset->ival - prev_offset->ival
&& lip != NULL)
{
lip = lip->next;
line_diff ++;
}
/* insert new offset value between prev_offset and new_offset */
splice_offset = s_IntLinkNew (line_start, NULL);
if (splice_offset == NULL) {
return;
}
splice_offset->next = new_offset;
prev_offset->next = splice_offset;
/* the value returned by this call is not used */
s_CountCharactersBetweenOffsets (lip,
new_offset->ival - splice_offset->ival,
best_num_chars);
}
}
}
prev_offset = new_offset;
}
/* iterate through the last block */
for (line_diff = 0;
line_diff < block_length && lip != NULL;
line_diff ++) {
lip = lip->next;
}
/* if we have room for one more sequence, or even most of one more sequence, add it */
/* the link is created through s_IntLinkNew with prev_offset as its list
 * argument; splice_offset itself is not used afterwards
 */
if (lip != NULL && ! s_SkippableString (lip->data)) {
splice_offset = s_IntLinkNew (line_diff + prev_offset->ival, prev_offset);
}
}
/* This function returns eTrue if the string contains at least one decimal
 * digit and eFalse otherwise. A NULL string contains no digits.
 */
static EBool s_ContainsDigits (char *data)
{
    char *cp;

    if (data == NULL) return eFalse;
    for (cp = data; *cp != 0; cp++) {
        /* isdigit is only defined for values representable as unsigned
         * char (or EOF); a plain char may be negative for bytes >= 0x80
         */
        if (isdigit ((unsigned char) *cp)) {
            return eTrue;
        }
    }
    return eFalse;
}
/* This function processes the alignment file data by dividing the original
 * lines into pieces based on whitespace and looking for patterns of length
 * in the data.
 *
 * The tokens are reduced to a list of lengths (skippable tokens and tokens
 * containing digits count as length 0), the most popular length pattern is
 * used as an anchor, and the token in front of each anchor is treated as the
 * sequence ID. Nothing is done when afrp or its line list is NULL.
 * All temporary lists created here are released on every exit path, with
 * the exception noted for offset_list below.
 */
static void s_ProcessAlignFileRawByLengthPattern (SAlignRawFilePtr afrp)
{
    TLineInfoPtr   token_list;
    SLengthListPtr list;
    TLineInfoPtr   lip;
    SLengthListPtr anchorpattern[2];
    TIntLinkPtr    offset_list;
    int            best_length;
    int            best_num_chars;

    if (afrp == NULL || afrp->line_list == NULL) {
        return;
    }

    token_list = s_BuildTokenList (afrp->line_list);
    token_list = s_RemoveCommentsFromTokens (token_list);
    token_list = s_RemoveNexusCommentsFromTokens (token_list);

    list = s_LengthListNew ( NULL );
    if (list == NULL) {
        /* list->lengthrepeats is read below, so give up here */
        if (token_list != NULL) {
            s_LineInfoFree (token_list);
        }
        return;
    }
    for (lip = token_list;
         lip != NULL && ! s_FoundStopLine (lip->data);
         lip = lip->next)
    {
        if (s_SkippableString (lip->data) || s_ContainsDigits(lip->data)) {
            s_AddLengthRepeat (list, 0);
        } else {
            s_AddLengthRepeat (list, strlen (lip->data));
        }
    }

    anchorpattern [0] = s_FindMostPopularPattern (list->lengthrepeats);
    anchorpattern [1] = NULL;
    if (anchorpattern [0] != NULL && anchorpattern[0]->lengthrepeats != NULL) {
        /* find anchor patterns in original list,
         * find distances between anchor patterns
         */
        offset_list = s_CreateOffsetList (list->lengthrepeats,
                                          anchorpattern[0]);
        offset_list = s_AugmentOffsetList (offset_list,
                                           list->lengthrepeats,
                                           anchorpattern[0]);

        /* resolve unusual distances between anchor patterns */
        best_length = s_GetMostPopularPatternLength (offset_list);
        if (best_length < 1 && offset_list != NULL
            && offset_list->next != NULL) {
            best_length = offset_list->next->ival - offset_list->ival;
        }
        best_num_chars = s_GetBestCharacterLength (token_list, offset_list,
                                                   best_length);
        s_InsertNewOffsets (token_list, offset_list, best_length,
                            best_num_chars, afrp->alphabet);

        /* use token before each anchor pattern as ID, use tokens for
         * distance between anchor patterns for sequence data
         */
        s_CreateSequencesBasedOnTokenPatterns (token_list, offset_list,
                                               anchorpattern, afrp);
        /* NOTE(review): offset_list is not released here, as before;
         * confirm whether s_CreateSequencesBasedOnTokenPatterns takes
         * ownership of it.
         */
    }

    /* shared cleanup, also reached when no usable anchor was found */
    s_LengthListFree (anchorpattern[0]);
    s_LengthListFree (list);
    if (token_list != NULL) {
        s_LineInfoFree (token_list);
    }
}
/* The following functions are used to convert data from the internal
* representation into the form that will be passed to the calling
* program. Information from the ID strings is parsed to remove
* definition lines and organism information, the gap characters are
* standardized to '-', the missing characters are standardized to 'N',
* match characters are replaced with characters from the first record,
* and bad characters are reported.
*/
/* Creates an empty SAlignmentFile: every counter is set to zero and every
 * array pointer to NULL. Returns NULL when the structure cannot be
 * allocated. The result is released with AlignmentFileFree.
 */
extern TAlignmentFilePtr AlignmentFileNew (void)
{
    TAlignmentFilePtr result;

    result = (TAlignmentFilePtr) malloc (sizeof (SAlignmentFile));
    if (result != NULL) {
        /* arrays */
        result->ids       = NULL;
        result->sequences = NULL;
        result->organisms = NULL;
        result->deflines  = NULL;
        /* element counts */
        result->num_sequences = 0;
        result->num_organisms = 0;
        result->num_deflines  = 0;
        result->num_segments  = 0;
    }
    return result;
}
/* This function frees the memory associated with an AlignmentFile
 * structure and its member variables: each string in the ids, sequences,
 * organisms and deflines arrays, the arrays themselves, and finally the
 * structure. ids and sequences both hold num_sequences entries.
 * Passing NULL is allowed and does nothing.
 */
extern void AlignmentFileFree (TAlignmentFilePtr afp)
{
    int  index;

    if (afp == NULL) {
        return;
    }
    if (afp->ids != NULL) {
        for (index = 0; index < afp->num_sequences; index++) {
            free (afp->ids [index]);
        }
        free (afp->ids);
        afp->ids = NULL;
    }
    if (afp->sequences != NULL) {
        for (index = 0; index < afp->num_sequences; index++) {
            free (afp->sequences [index]);
        }
        free (afp->sequences);
        afp->sequences = NULL;
    }
    if (afp->organisms != NULL) {
        for (index = 0; index < afp->num_organisms; index++) {
            free (afp->organisms [index]);
        }
        free (afp->organisms);
        /* reset the pointer that was just freed (not afp->sequences) */
        afp->organisms = NULL;
    }
    if (afp->deflines != NULL) {
        for (index = 0; index < afp->num_deflines; index++) {
            free (afp->deflines [index]);
        }
        free (afp->deflines);
        afp->deflines = NULL;
    }
    free (afp);
}
/* This function parses the identifier string used by the alignment file
 * to identify a sequence to find the portion of the string that is actually
 * an ID, as opposed to organism information or definition line.
 *
 * Leading spaces, tabs and '>' characters are skipped; the ID is everything
 * from there up to the next space, tab, carriage return or line feed.
 * Returns a newly allocated copy of the ID that the caller must free, or
 * NULL if str is NULL, the ID is empty, or memory cannot be allocated.
 */
static char * s_GetIdFromString (char * str)
{
    char * cp;
    char * id;
    size_t len;

    if (str == NULL) {
        return NULL;
    }

    /* the delimiter sets must hold the whitespace characters themselves,
     * not the letters 't', 'r' and 'n'
     */
    cp = str;
    cp += strspn (str, " >\t");
    len = strcspn (cp, " \t\r\n");
    if (len == 0) {
        return NULL;
    }
    id = malloc (len + 1);
    if (id == NULL) {
        return NULL;
    }
    memcpy (id, cp, len);
    id [ len ] = 0;
    return id;
}
/* This function pulls defline information from the ID string, if there is
 * any.
 *
 * The ID is located as in s_GetIdFromString (leading spaces, tabs and '>'
 * skipped, then everything up to the next whitespace); the defline is the
 * text that follows the whitespace after the ID.
 * Returns a newly allocated copy of the defline that the caller must free,
 * or NULL if str is NULL, there is no ID, nothing follows the ID, or memory
 * cannot be allocated.
 */
static char * s_GetDeflineFromIdString (char * str)
{
    char * cp;
    char * defline;
    size_t len;

    if (str == NULL) {
        return NULL;
    }

    /* step over the ID; the delimiter sets must hold the whitespace
     * characters themselves, not the letters 't', 'r' and 'n'
     */
    cp = str;
    cp += strspn (str, " >\t");
    len = strcspn (cp, " \t\r\n");
    if (len == 0) {
        return NULL;
    }
    cp += len;

    /* step over the whitespace that separates ID and defline */
    len = strspn (cp, " \t\r\n");
    if (len == 0) {
        return NULL;
    }
    cp += len;
    if (*cp == 0) {
        return NULL;
    }

    /* copied by hand so that only ISO C library functions are required */
    len = strlen (cp);
    defline = malloc (len + 1);
    if (defline == NULL) {
        return NULL;
    }
    memcpy (defline, cp, len + 1);
    return defline;
}
/* This function takes the ID strings read from the file and parses them
 * to obtain a defline (if there is extra text after the ID and/or
 * organism information) and to obtain the actual ID for the sequence.
 *
 * Each sequence's id is replaced by the value s_GetIdFromString extracts
 * from it. If afrp had no deflines when the function was entered, one
 * defline entry per sequence is added from the text after the ID.
 * Every ID that occurs more than once is reported through
 * afrp->report_error.
 * Returns eTrue if all IDs are distinct; eFalse if afrp is NULL or an ID
 * is repeated.
 */
static EBool s_ReprocessIds (SAlignRawFilePtr afrp)
{
    TStringCountPtr list, scp;
    TAlignRawSeqPtr arsp;
    TLineInfoPtr    lip;
    char *          id;
    int             line_num;
    EBool           rval = eTrue;
    char *          defline;

    if (afrp == NULL) {
        return eFalse;
    }

    list = NULL;
    /* sampled once: tells whether deflines existed before this call */
    lip = afrp->deflines;
    for (arsp = afrp->sequences; arsp != NULL; arsp = arsp->next) {
        if (arsp->id_lines != NULL) {
            line_num = arsp->id_lines->ival;
        } else {
            line_num = -1;
        }
        s_RemoveOrganismCommentFromLine (arsp->id);
        id = s_GetIdFromString (arsp->id);
        if (lip == NULL) {
            defline = s_GetDeflineFromIdString (arsp->id);
            afrp->deflines = s_AddLineInfo (afrp->deflines, defline,
                                            line_num, 0);
            free (defline);
            afrp->num_deflines ++;
        }
        free (arsp->id);
        arsp->id = id;
        list = s_AddStringCount (arsp->id, line_num, list);
    }

    for (scp = list; scp != NULL; scp = scp->next) {
        if (scp->num_appearances > 1) {
            rval = eFalse;
            s_ReportRepeatedId (scp, afrp->report_error,
                                afrp->report_error_userdata);
        }
    }
    /* the tally is only needed for the report above */
    s_StringCountFree (list);
    return rval;
}
/* Reports a run of identical unacceptable characters as a single error.
 * A wrong character often shows up many times in a row (for instance when
 * the gap character was specified incorrectly), so the run that starts at
 * the reader's current position is counted and reported once, together with
 * the line number and offset of its first character.
 * Looking ahead with s_FindNthDataChar moves lirp along the data. The
 * return value is the data position of the first character after the run.
 */
static int
s_ReportRepeatedBadCharsInSequence
(TLineInfoReaderPtr   lirp,
 char *               id,
 char *               reason,
 FReportErrorFunction report_error,
 void *              report_error_userdata)
{
    int  line_of_error, offset_of_error;
    int  run_length;
    int  next_pos;
    char offending, probe;

    /* capture where the run starts before the reader is moved */
    line_of_error = s_LineInfoReaderGetCurrentLineNumber (lirp);
    offset_of_error = s_LineInfoReaderGetCurrentLineOffset (lirp);
    offending = *lirp->curr_line_pos;

    run_length = 1;
    next_pos = lirp->data_pos + 1;
    for (;;) {
        probe = s_FindNthDataChar (lirp, next_pos);
        if (probe != offending) {
            break;
        }
        run_length ++;
        next_pos ++;
    }

    s_ReportBadCharError (id, offending, run_length,
                          offset_of_error, line_of_error, reason,
                          report_error, report_error_userdata);
    return next_pos;
}
/* This function does context-sensitive replacement of the missing,
 * match, and gap characters and also identifies bad characters.
 * Gap characters found in the wrong location in the sequence are
 * considered an error. Characters that are not missing, match, or
 * gap characters and are not in the specified sequence alphabet are
 * reported as errors. Match characters in the first sequence are also
 * reported as errors.
 *
 * arsp        - sequence to check; its data is modified in place
 * master_arsp - first sequence, the source for match characters
 * sip         - alphabet and special character definitions
 *
 * The function will return eTrue if any errors were found (or if an
 * argument is NULL or a reader cannot be created), or eFalse
 * if there were no errors.
 */
static EBool
s_FindBadDataCharsInSequence
(TAlignRawSeqPtr      arsp,
 TAlignRawSeqPtr      master_arsp,
 TSequenceInfoPtr     sip,
 FReportErrorFunction report_error,
 void *              report_error_userdata)
{
    TLineInfoReaderPtr lirp, master_lirp;
    int                data_position;
    int                middle_start = 0;
    int                middle_end = 0;
    char               curr_char, master_char;
    EBool              found_middle_start;
    EBool              rval = eFalse;
    EBool              match_not_in_beginning_gap;
    EBool              match_not_in_end_gap;

    if (arsp == NULL || master_arsp == NULL || sip == NULL) {
        return eTrue;
    }
    lirp = s_LineInfoReaderNew (arsp->sequence_data);
    if (lirp == NULL) {
        return eTrue;
    }
    /* the first sequence has no master to take match characters from */
    if (arsp != master_arsp) {
        master_lirp = s_LineInfoReaderNew (master_arsp->sequence_data);
        if (master_lirp == NULL) {
            s_LineInfoReaderFree (lirp);
            return eTrue;
        }
    } else {
        master_lirp = NULL;
    }

    /* a match character only counts as sequence data at an end of the
     * sequence if it cannot be mistaken for a gap character there
     */
    if (strcspn (sip->beginning_gap, sip->match)
                  == strlen (sip->beginning_gap)) {
        match_not_in_beginning_gap = eTrue;
    } else {
        match_not_in_beginning_gap = eFalse;
    }

    if (strcspn (sip->end_gap, sip->match) == strlen (sip->end_gap)) {
        match_not_in_end_gap = eTrue;
    } else {
        match_not_in_end_gap = eFalse;
    }

    /* First, find middle start and end positions and report characters
     * that are not beginning gap before the middle
     */
    found_middle_start = eFalse;
    data_position = 0;
    curr_char = s_FindNthDataChar (lirp, data_position);
    while (curr_char != 0) {
        if (strchr (sip->alphabet, curr_char) != NULL) {
            if (! found_middle_start) {
                middle_start = data_position;
                found_middle_start = eTrue;
            }
            middle_end = data_position + 1;
            data_position ++;
        } else if (! found_middle_start) {
            if (match_not_in_beginning_gap
                && strchr (sip->match, curr_char) != NULL)
            {
                middle_start = data_position;
                found_middle_start = eTrue;
                middle_end = data_position + 1;
                data_position ++;
            } else if (strchr (sip->beginning_gap, curr_char) == NULL) {
                /* Report error - found character that is not beginning gap
                   in beginning gap */
                data_position = s_ReportRepeatedBadCharsInSequence (lirp,
                                  arsp->id,
                                  "expect only beginning gap characters here",
                                  report_error, report_error_userdata);
                rval = eTrue;
            } else {
                *lirp->curr_line_pos = '-';
                data_position ++;
            }
        } else {
            if (match_not_in_end_gap
                && strchr (sip->match, curr_char) != NULL)
            {
                middle_end = data_position + 1;
            }
            data_position ++;
        }
        curr_char = s_FindNthDataChar (lirp, data_position);
    }

    if (! found_middle_start) {
        s_ReportMissingSequenceData (arsp->id,
                                     report_error, report_error_userdata);
        s_LineInfoReaderFree (lirp);
        /* the master reader has to be released on this path as well */
        s_LineInfoReaderFree (master_lirp);
        return eTrue;
    }

    /* Now complain about bad middle characters */
    data_position = middle_start;
    while (data_position < middle_end)
    {
        curr_char = s_FindNthDataChar (lirp, data_position);
        while (data_position < middle_end
               && strchr (sip->alphabet, curr_char) != NULL) {
            data_position ++;
            curr_char = s_FindNthDataChar (lirp, data_position);
        }
        if (curr_char == 0 || data_position >= middle_end) {
            /* do nothing, done with middle */
        } else if (strchr (sip->missing, curr_char) != NULL) {
            *lirp->curr_line_pos = 'N';
            data_position ++;
        } else if (strchr (sip->match, curr_char) != NULL) {
            master_char = s_FindNthDataChar (master_lirp, data_position);
            if (master_char == 0) {
                /* report error - unable to get master char */
                if (master_arsp == arsp) {
                    data_position = s_ReportRepeatedBadCharsInSequence (lirp,
                                 arsp->id,
                                 "can't specify match chars in first sequence",
                                 report_error, report_error_userdata);
                } else {
                    data_position = s_ReportRepeatedBadCharsInSequence (lirp,
                                 arsp->id,
                                 "can't find source for match chars",
                                 report_error, report_error_userdata);
                }
                rval = eTrue;
            } else {
                *lirp->curr_line_pos = master_char;
                data_position ++;
            }
        } else if (strchr (sip->middle_gap, curr_char) != NULL) {
            *lirp->curr_line_pos = '-';
            data_position ++;
        } else {
            /* Report error - found bad character in middle */
            data_position = s_ReportRepeatedBadCharsInSequence (lirp,
                                 arsp->id,
                                 "expect only sequence, missing, match,"
                                 " and middle gap characters here",
                                 report_error, report_error_userdata);
            rval = eTrue;
        }
    }

    /* Now find and complain about end characters */
    data_position = middle_end;
    curr_char = s_FindNthDataChar (lirp, data_position);
    while (curr_char != 0) {
        if (strchr (sip->end_gap, curr_char) == NULL) {
            /* Report error - found bad character in end gap */
            data_position = s_ReportRepeatedBadCharsInSequence (lirp, arsp->id,
                                      "expect only end gap characters here",
                                      report_error, report_error_userdata);
            rval = eTrue;
        } else {
            *lirp->curr_line_pos = '-';
            data_position++;
        }
        curr_char = s_FindNthDataChar (lirp, data_position);
    }
    s_LineInfoReaderFree (lirp);
    s_LineInfoReaderFree (master_lirp);
    return rval;
}
/* Runs s_FindBadDataCharsInSequence over every sequence of the file, using
 * the first sequence as the master for match characters. All sequences are
 * processed even after an error has been seen.
 * Returns eTrue when any sequence produced an error, or when afrp is NULL or
 * holds no sequences; eFalse otherwise.
 */
static EBool
s_s_FindBadDataCharsInSequenceList
(SAlignRawFilePtr afrp,
 TSequenceInfoPtr sip)
{
    TAlignRawSeqPtr seq;
    EBool           any_error = eFalse;

    if (afrp == NULL) {
        return eTrue;
    }
    if (afrp->sequences == NULL) {
        return eTrue;
    }

    seq = afrp->sequences;
    while (seq != NULL) {
        if (s_FindBadDataCharsInSequence (seq, afrp->sequences, sip,
                                          afrp->report_error,
                                          afrp->report_error_userdata)) {
            any_error = eTrue;
        }
        seq = seq->next;
    }
    return any_error;
}
/* Checks whether any organism entry (name together with its comments)
 * occurs more than once in the alignment. Each entry is compared with the
 * entries in front of it; the sequence list is walked in parallel so that a
 * repeat can be reported with the ID of the sequence at the position of the
 * earlier entry.
 * Returns eTrue if no repeat was reported; eFalse if one was, or if afrp is
 * NULL or has no organisms.
 */
static EBool s_AreOrganismsUnique (SAlignRawFilePtr afrp)
{
    TLineInfoPtr    candidate, earlier;
    TAlignRawSeqPtr seq;
    EBool           all_unique = eTrue;

    if (afrp == NULL) {
        return eFalse;
    }
    if (afrp->num_organisms == 0 || afrp->organisms == NULL) {
        return eFalse;
    }

    for (candidate = afrp->organisms;
         candidate != NULL;
         candidate = candidate->next) {
        earlier = afrp->organisms;
        seq = afrp->sequences;
        while (earlier != NULL && earlier != candidate) {
            if (strcmp (earlier->data, candidate->data) == 0 || seq == NULL) {
                break;
            }
            earlier = earlier->next;
            seq = seq->next;
        }
        if (earlier == NULL || earlier == candidate) {
            continue;
        }
        all_unique = eFalse;
        /* NOTE(review): seq can be NULL here when there are fewer sequences
         * than organisms; confirm callers rule that out.
         */
        s_ReportRepeatedOrganismName (seq->id, candidate->line_num,
                                      earlier->line_num,
                                      candidate->data,
                                      afrp->report_error,
                                      afrp->report_error_userdata);
    }
    return all_unique;
}
/* Tells whether every sequence carries the same definition line. The
 * deflines are tallied with s_AddStringCount; if the tally ends up with
 * more than one entry, a mismatch is reported through afrp->report_error
 * together with the tallied definition lines.
 * Returns eTrue when the tally has at most one entry; eFalse on a mismatch
 * or when afrp is NULL.
 */
static EBool s_AreDeflinesIdentical (SAlignRawFilePtr afrp)
{
    TStringCountPtr counts = NULL;
    TLineInfoPtr    line;
    EBool           identical = eTrue;

    if (afrp == NULL) {
        return eFalse;
    }

    line = afrp->deflines;
    while (line != NULL) {
        counts = s_AddStringCount (line->data, line->line_num, counts);
        line = line->next;
    }

    if (counts != NULL && counts->next != NULL) {
        identical = eFalse;
        s_ReportDefinitionLineMismatch (afrp->report_error,
                                        afrp->report_error_userdata);
        s_ReportDefinitionLines (counts, afrp->report_error,
                                 afrp->report_error_userdata);
    }

    s_StringCountFree (counts);
    return identical;
}
/* This function uses the contents of an SAlignRawFileData structure to
 * create an SAlignmentFile structure with the appropriate information.
 *
 * afrp supplies the raw sequences, organisms, deflines and the error
 * reporting callback; sip is only checked for NULL here.
 *
 * The function returns NULL if either argument is NULL, if afrp has no
 * sequences, if afrp->num_segments is less than one, or if any of the arrays
 * cannot be allocated.  Otherwise it returns the structure obtained from
 * AlignmentFileNew, filled with copies of the IDs, merged sequence data,
 * organism lines and definition lines.
 *
 * Problems found while converting (organism count not matching the sequence
 * count, missing sequence data, sequence lengths that differ from the most
 * popular length for their segment, disagreement with the expected number of
 * sequences or expected sequence length) are reported through
 * afrp->report_error; they do not cause NULL to be returned.
 */
static TAlignmentFilePtr
s_ConvertDataToOutput
(SAlignRawFilePtr afrp,
 TSequenceInfoPtr sip)
{
    TAlignRawSeqPtr   arsp;
    int               index;
    TSizeInfoPtr *    lengths;
    int *             best_length;
    TAlignmentFilePtr afp;
    TLineInfoPtr      lip;
    int               curr_seg;
    size_t            seq_len;

    /* num_segments is used below as a divisor, as an allocation count and
     * as the bound that makes best_length [0] valid, so it must be >= 1.
     */
    if (afrp == NULL  ||  sip == NULL  ||  afrp->sequences == NULL
        ||  afrp->num_segments < 1) {
        return NULL;
    }
    afp = AlignmentFileNew ();
    if (afp == NULL) {
        return NULL;
    }

    afp->num_organisms = afrp->num_organisms;
    afp->num_deflines  = afrp->num_deflines;
    afp->num_segments  = afrp->num_segments;
    afp->num_sequences = 0;
    for (arsp = afrp->sequences;  arsp != NULL;  arsp = arsp->next) {
        afp->num_sequences++;
    }

    /* There should be one organism per sequence, or one organism per set of
     * segments; uniqueness is only checked when the counts are consistent.
     */
    if (afp->num_sequences != afrp->num_organisms
        &&  afp->num_sequences / afp->num_segments != afrp->num_organisms) {
        s_ReportMissingOrganismInfo (afrp->report_error,
                                     afrp->report_error_userdata);
    } else {
        s_AreOrganismsUnique (afrp);
    }

    /* The pointer arrays are zero-filled on allocation so that no entry is
     * left indeterminate if the function bails out before filling them.
     */
    afp->sequences = (char **) calloc ((size_t) afp->num_sequences,
                                       sizeof (char *));
    if (afp->sequences == NULL) {
        AlignmentFileFree (afp);
        return NULL;
    }
    afp->ids = (char **) calloc ((size_t) afp->num_sequences,
                                 sizeof (char *));
    if (afp->ids == NULL) {
        AlignmentFileFree (afp);
        return NULL;
    }
    if (afp->num_organisms > 0) {
        afp->organisms = (char **) calloc ((size_t) afp->num_organisms,
                                           sizeof (char *));
        if (afp->organisms == NULL) {
            AlignmentFileFree (afp);
            return NULL;
        }
    }
    if (afp->num_deflines > 0) {
        afp->deflines = (char **) calloc ((size_t) afp->num_deflines,
                                          sizeof (char *));
        if (afp->deflines == NULL) {
            AlignmentFileFree (afp);
            return NULL;
        }
    }

    /* copy in deflines; entries without data, and entries beyond the end of
     * the list, are set to NULL
     */
    for (lip = afrp->deflines, index = 0;
         lip != NULL  &&  index < afp->num_deflines;
         lip = lip->next, index++) {
        if (lip->data == NULL) {
            afp->deflines [index] = NULL;
        } else {
            afp->deflines [index] = strdup (lip->data);
        }
    }
    while (index < afp->num_deflines) {
        afp->deflines [index ++] = NULL;
    }

    /* copy in organism information, handled the same way as the deflines */
    for (lip = afrp->organisms, index = 0;
         lip != NULL  &&  index < afp->num_organisms;
         lip = lip->next, index++) {
        if (lip->data == NULL) {
            afp->organisms [index] = NULL;
        } else {
            afp->organisms [index] = strdup (lip->data);
        }
    }
    while (index < afp->num_organisms) {
        afp->organisms [index ++] = NULL;
    }

    /* we need to store length information about different segments
     * separately
     */
    lengths = (TSizeInfoPtr *) calloc ((size_t) afrp->num_segments,
                                       sizeof (TSizeInfoPtr));
    if (lengths == NULL) {
        AlignmentFileFree (afp);
        return NULL;
    }
    best_length = (int *) calloc ((size_t) afrp->num_segments, sizeof (int));
    if (best_length == NULL) {
        free (lengths);
        AlignmentFileFree (afp);
        return NULL;
    }
    for (curr_seg = 0;  curr_seg < afrp->num_segments;  curr_seg ++) {
        lengths [curr_seg] = NULL;
        best_length [curr_seg] = 0;
    }

    /* copy in sequence data; consecutive sequences are assigned to segments
     * 0, 1, ..., num_segments - 1 and then back to 0
     */
    curr_seg = 0;
    for (arsp = afrp->sequences, index = 0;
         arsp != NULL  &&  index < afp->num_sequences;
         arsp = arsp->next, index++) {
        afp->sequences [index] =
                        s_LineInfoMergeAndStripSpaces (arsp->sequence_data);
        if (afp->sequences [index] != NULL) {
            lengths [curr_seg] = s_AddSizeInfo (lengths [curr_seg],
                                            strlen (afp->sequences [index]));
        }
        /* arsp->id comes from an unchecked strdup and may be NULL */
        if (arsp->id == NULL) {
            afp->ids [index] = NULL;
        } else {
            afp->ids [index] = strdup (arsp->id);
        }
        curr_seg ++;
        if (curr_seg >= afrp->num_segments) {
            curr_seg = 0;
        }
    }

    /* pick the reference length for each segment; if no size is reported as
     * most popular, fall back to the first recorded size
     */
    for (curr_seg = 0;  curr_seg < afrp->num_segments;  curr_seg ++) {
        best_length [curr_seg] = s_GetMostPopularSize (lengths [curr_seg]);
        if (best_length [curr_seg] == 0  &&  lengths [curr_seg] != NULL) {
            best_length [curr_seg] = lengths [curr_seg]->size_value;
        }
    }

    /* report sequences that are missing or whose length differs from the
     * reference length of their segment
     */
    curr_seg = 0;
    for (index = 0;  index < afp->num_sequences;  index++) {
        if (afp->sequences [index] == NULL) {
            s_ReportMissingSequenceData (afp->ids [index],
                                         afrp->report_error,
                                         afrp->report_error_userdata);
        } else {
            seq_len = strlen (afp->sequences [index]);
            if ((int) seq_len != best_length [curr_seg]) {
                s_ReportBadSequenceLength (afp->ids [index],
                                           best_length [curr_seg],
                                           seq_len,
                                           afrp->report_error,
                                           afrp->report_error_userdata);
            }
        }
        curr_seg ++;
        if (curr_seg >= afrp->num_segments) {
            curr_seg = 0;
        }
    }

    if (afrp->expected_num_sequence > 0
        &&  afrp->expected_num_sequence != afp->num_sequences) {
        s_ReportIncorrectNumberOfSequences (afrp->expected_num_sequence,
                                            afp->num_sequences,
                                            afrp->report_error,
                                            afrp->report_error_userdata);
    }
    /* the expected length is compared against the first segment only */
    if (afrp->expected_sequence_len > 0
        &&  afrp->expected_sequence_len != best_length [0]) {
        s_ReportIncorrectSequenceLength (afrp->expected_sequence_len,
                                         best_length [0],
                                         afrp->report_error,
                                         afrp->report_error_userdata);
    }

    free (best_length);
    for (curr_seg = 0;  curr_seg < afrp->num_segments;  curr_seg ++) {
        s_SizeInfoFree (lengths [curr_seg]);
    }
    free (lengths);

    return afp;
}
/* This is the entry point used by the calling program to read an alignment
 * file.
 * readfunc is a function pointer supplied by the calling program; it is
 * handed, together with fileuserdata, to the raw file reader, which uses it
 * to obtain the data from the file.  fileuserdata is a pointer to data used
 * by the calling program's readfunc function.
 * errfunc is a function pointer supplied by the calling program for
 * reporting errors.  erroruserdata is a pointer to data used by the calling
 * program's errfunc function and will be passed back with each call to
 * errfunc.
 * sequence_info contains the sequence alphabet and the missing, match, and
 * gap characters to use in interpreting the sequence data.
 *
 * The function returns NULL if sequence_info or its alphabet is NULL, if the
 * raw file could not be read, or if bad data characters were found in the
 * sequence list.  Otherwise it returns the result of converting the raw data
 * to output form.
 */
extern TAlignmentFilePtr
ReadAlignmentFile
(FReadLineFunction    readfunc,
 void *               fileuserdata,
 FReportErrorFunction errfunc,
 void *               erroruserdata,
 TSequenceInfoPtr     sequence_info)
{
    SAlignRawFilePtr  raw;
    TAlignmentFilePtr result;

    if (sequence_info == NULL  ||  sequence_info->alphabet == NULL) {
        return NULL;
    }

    raw = s_ReadAlignFileRaw (readfunc, fileuserdata, sequence_info,
                              errfunc, erroruserdata);
    if (raw == NULL) {
        return NULL;
    }

    /* Choose how to assign lines to sequences based on what the raw reader
     * recorded about the file.
     */
    if (raw->block_size > 1) {
        s_ProcessAlignRawFileByBlockOffsets (raw);
    } else if (raw->marked_ids) {
        s_ProcessAlignFileRawForMarkedIDs (raw);
    } else {
        s_ProcessAlignFileRawByLengthPattern (raw);
    }

    s_ReprocessIds (raw);

#if 0 /* this step was removed by indexer request */
    /* Note - have to check deflines after reprocessing IDs */
    s_AreDeflinesIdentical (raw);
#endif

    /* Only convert when no bad data characters were found; the raw data is
     * released either way.
     */
    result = NULL;
    if (! s_s_FindBadDataCharsInSequenceList (raw, sequence_info)) {
        result = s_ConvertDataToOutput (raw, sequence_info);
    }
    s_AlignFileRawFree (raw);
    return result;
}
/*
* ===========================================================================
* $Log: alnread.c,v $
* Revision 1000.1 2004/06/01 19:41:15 gouriano
* PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.10
*
* Revision 1.10 2004/05/20 19:40:24 bollin
* Made changes to allow reading of alignments of segmented sets.
* Also added warnings for when organism lines may be present but improperly
* formatted.
*
* Revision 1.9 2004/03/16 21:05:15 bollin
* Added some improvements to the portion of the alignment reader that deals
* with contiguous alignments that do not have a '>' at the beginning of each
* ID.
*
* Revision 1.8 2004/03/16 16:25:38 bollin
* Added function to recognize a file as ASN.1 and reject immediately
*
* Revision 1.7 2004/03/09 21:27:39 bollin
* in s_InsertNewOffsets, if the list ends while searching for the next pattern, exit immediately (prevents NULL pointer access)
*
* Revision 1.6 2004/03/04 19:15:07 bollin
* file reading now skips over multi-line bracketed comments
*
* Revision 1.5 2004/03/04 16:29:32 bollin
* added skip of taxa comment for PAUP format alignment files
*
* Revision 1.4 2004/02/10 16:15:13 bollin
* now checks for unused lines when finding interleaved blocks, will reject and try other methods if unused lines found after first block found.
*
* Revision 1.3 2004/02/05 16:29:32 bollin
* smarter function for skipping NEXUS comment lines
*
* Revision 1.2 2004/02/04 19:49:11 bollin
* fixed infinite loop condition in s_AugmentOffsetList, properly skip over first non-space column when looking for interleaved block patterns in s_ReadAlignFileRaw
*
* Revision 1.1 2004/02/03 16:47:02 ucko
* Add Colleen Bollin's Toolkit-independent alignment reader.
*
* Revision 1.38 2004/01/30 22:46:08 bollin
* renamed defined variable, fixed typo in comment
*
* Revision 1.37 2004/01/30 21:48:14 bollin
* changes for compatibility with Windows
*
* Revision 1.36 2004/01/30 21:33:41 bollin
* replaced strncasecmp and strncase function calls
*
* Revision 1.35 2004/01/29 19:16:27 bollin
* use EBool for boolean values
*
* Revision 1.34 2004/01/29 17:58:11 bollin
* aligned assignment blocks in New functions
*
* Revision 1.33 2004/01/29 17:43:40 bollin
* added directory specification to alnread.h include line
*
* Revision 1.32 2004/01/29 17:41:29 bollin
* added comment block, id tags, log
*
* ===========================================================================
*/