alnread.c
上传用户:yhdzpy8989
上传日期:2007-06-13
资源大小:13604k
文件大小:173k
- /*
- * ===========================================================================
- * PRODUCTION $Log: alnread.c,v $
- * PRODUCTION Revision 1000.1 2004/06/01 19:41:15 gouriano
- * PRODUCTION PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.10
- * PRODUCTION
- * ===========================================================================
- */
- /*
- * $Id: alnread.c,v 1000.1 2004/06/01 19:41:15 gouriano Exp $
- *
- * ===========================================================================
- *
- * PUBLIC DOMAIN NOTICE
- * National Center for Biotechnology Information
- *
- * This software/database is a "United States Government Work" under the
- * terms of the United States Copyright Act. It was written as part of
- * the author's official duties as a United States Government employee and
- * thus cannot be copyrighted. This software/database is freely available
- * to the public for use. The National Library of Medicine and the U.S.
- * Government have not placed any restriction on its use or reproduction.
- *
- * Although all reasonable efforts have been taken to ensure the accuracy
- * and reliability of the software and data, the NLM and the U.S.
- * Government do not and cannot warrant the performance or results that
- * may be obtained by using this software or data. The NLM and the U.S.
- * Government disclaim all warranties, express or implied, including
- * warranties of performance, merchantability or fitness for any particular
- * purpose.
- *
- * Please cite the author in any work or product based on this material.
- *
- * ===========================================================================
- *
- * Authors: Colleen Bollin
- *
- */
- #include <util/creaders/alnread.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #include <ctype.h>
/* Number of characters budgeted for one printed int when sizing message
 * buffers in this file.
 */
static const int kMaxPrintedIntLen = 10;
#define MAX_PRINTED_INT_LEN_PLUS_ONE 11

/* Internal boolean type.  Note that eTrue is -1, not 1. */
typedef enum {
    eFalse = 0,
    eTrue  = -1
} EBool;
- /* structures used internally */
- typedef struct SLineInfo {
- char * data;
- int line_num;
- int line_offset;
- EBool delete_me;
- struct SLineInfo * next;
- } SLineInfo, * TLineInfoPtr;
- typedef struct SLineInfoReader {
- TLineInfoPtr first_line;
- TLineInfoPtr curr_line;
- char * curr_line_pos;
- int data_pos;
- } SLineInfoReader, * TLineInfoReaderPtr;
/* One node of a singly linked list of integer values. */
struct SIntLink {
    int               ival;  /* stored value */
    struct SIntLink * next;  /* following node, NULL at the tail */
};
typedef struct SIntLink   SIntLink;
typedef struct SIntLink * TIntLinkPtr;
- typedef struct SStringCount {
- char * string;
- int num_appearances;
- TIntLinkPtr line_numbers;
- struct SStringCount * next;
- } SStringCount, * TStringCountPtr;
/* One node of a singly linked list recording how many times a given size
 * value has been seen.
 */
struct SSizeInfo {
    int                size_value;       /* the size being counted */
    int                num_appearances;  /* how often it was seen */
    struct SSizeInfo * next;             /* following node */
};
typedef struct SSizeInfo   SSizeInfo;
typedef struct SSizeInfo * TSizeInfoPtr;
- typedef struct SLengthList {
- TSizeInfoPtr lengthrepeats;
- int num_appearances;
- struct SLengthList * next;
- } SLengthListData, * SLengthListPtr;
-
/* One node of a singly linked list of comment locations, each described
 * by a start and an end pointer.
 * NOTE(review): whether end is inclusive is not visible here - confirm.
 */
struct SCommentLoc {
    char *               start;
    char *               end;
    struct SCommentLoc * next;
};
typedef struct SCommentLoc   SCommentLoc;
typedef struct SCommentLoc * TCommentLocPtr;
- typedef struct SBracketedCommentList
- {
- TLineInfoPtr comment_lines;
- struct SBracketedCommentList * next;
- } SBracketedCommentList, * TBracketedCommentListPtr;
- typedef struct SAlignRawSeq {
- char * id;
- TLineInfoPtr sequence_data;
- TIntLinkPtr id_lines;
- struct SAlignRawSeq * next;
- } SAlignRawSeq, * TAlignRawSeqPtr;
- typedef struct SAlignFileRaw {
- TLineInfoPtr line_list;
- TLineInfoPtr organisms;
- TAlignRawSeqPtr sequences;
- int num_organisms;
- TLineInfoPtr deflines;
- int num_deflines;
- EBool marked_ids;
- int block_size;
- TIntLinkPtr offset_list;
- FReportErrorFunction report_error;
- void * report_error_userdata;
- char * alphabet;
- int expected_num_sequence;
- int expected_sequence_len;
- int num_segments;
- } SAlignRawFileData, * SAlignRawFilePtr;
- /* These functions are used for storing and transmitting information
- * about errors encountered while reading the alignment data.
- */
- /* This function allocates memory for a new error structure and populates
- * the structure with default values.
- * The new structure will be added to the end of the linked list of error
- * structures pointed to by list.
- */
- extern TErrorInfoPtr ErrorInfoNew (TErrorInfoPtr list)
- {
- TErrorInfoPtr eip, last;
- eip = (TErrorInfoPtr) malloc ( sizeof (SErrorInfo));
- if (eip == NULL) {
- return NULL;
- }
- eip->category = eAlnErr_Unknown;
- eip->line_num = -1;
- eip->id = NULL;
- eip->message = NULL;
- eip->next = NULL;
- last = list;
- while (last != NULL && last->next != NULL) {
- last = last->next;
- }
- if (last != NULL) {
- last->next = eip;
- }
- return eip;
- }
- /* This function recursively frees the memory associated with a list of
- * error structures as well as the member variables of the error structures.
- */
- extern void ErrorInfoFree (TErrorInfoPtr eip)
- {
- if (eip == NULL) {
- return;
- }
- ErrorInfoFree (eip->next);
- free (eip->id);
- free (eip->message);
- free (eip);
- }
- /* This function creates and sends an error message regarding a NEXUS comment
- * character.
- */
- static void
- s_ReportCharCommentError
- (char * expected,
- char seen,
- char * val_name,
- FReportErrorFunction errfunc,
- void * errdata)
- {
- TErrorInfoPtr eip;
- const char * errformat = "Specified %s character does not match NEXUS"
- " comment in file (specified %s, comment %c)";
- if (errfunc == NULL || val_name == NULL || expected == NULL) {
- return;
- }
- eip = ErrorInfoNew (NULL);
- if (eip != NULL) {
- eip->category = eAlnErr_BadFormat;
- eip->message = (char *) malloc (strlen (errformat) + strlen (val_name)
- + strlen (expected) + 2);
- if (eip->message != NULL) {
- sprintf (eip->message, errformat, val_name, expected, seen);
- }
- errfunc (eip, errdata);
- }
- }
- /* This function creates and sends an error message regarding a character
- * that is unexpected in sequence data.
- */
- static void
- s_ReportBadCharError
- (char * id,
- char bad_char,
- int num_bad,
- int offset,
- int line_number,
- char * reason,
- FReportErrorFunction errfunc,
- void * errdata)
- {
- TErrorInfoPtr eip;
- const char * err_format =
- "%d bad characters (%c) found at position %d (%s).";
- if (errfunc == NULL || num_bad == 0 || bad_char == 0
- || reason == NULL) {
- return;
- }
- eip = ErrorInfoNew (NULL);
- if (eip == NULL) {
- return;
- }
- eip->category = eAlnErr_BadData;
- if (id != NULL) eip->id = strdup (id);
- eip->line_num = line_number;
- eip->message = (char *) malloc (strlen (err_format) + 2 * kMaxPrintedIntLen
- + strlen (reason) + 3);
- if (eip->message != NULL)
- {
- sprintf (eip->message, err_format, num_bad, bad_char, offset, reason);
- }
- errfunc (eip, errdata);
- }
-
- /* This function creates and sends an error message regarding an ID that
- * was found in the wrong location.
- */
- static void
- s_ReportInconsistentID
- (char * id,
- int line_number,
- FReportErrorFunction report_error,
- void * report_error_userdata)
- {
- TErrorInfoPtr eip;
- if (report_error == NULL) {
- return;
- }
- eip = ErrorInfoNew (NULL);
- if (eip == NULL) {
- return;
- }
- eip->category = eAlnErr_BadFormat;
- eip->id = strdup (id);
- eip->line_num = line_number;
- eip->message = strdup ("Found unexpected ID");
- report_error (eip, report_error_userdata);
- }
- /* This function creates and sends an error message regarding a line
- * of sequence data that was expected to have a different length.
- */
- static void
- s_ReportInconsistentBlockLine
- (char * id,
- int line_number,
- FReportErrorFunction report_error,
- void * report_error_userdata)
- {
- TErrorInfoPtr eip;
- if (report_error == NULL) {
- return;
- }
- eip = ErrorInfoNew (NULL);
- if (eip == NULL) {
- return;
- }
- eip->category = eAlnErr_BadFormat;
- eip->id = strdup (id);
- eip->line_num = line_number;
- eip->message = strdup ("Inconsistent block line formatting");
- report_error (eip, report_error_userdata);
- }
- /* This function creates and sends an error message regarding mismatched
- * definition lines
- */
- static void
- s_ReportDefinitionLineMismatch
- (FReportErrorFunction report_error,
- void * report_error_userdata)
- {
- TErrorInfoPtr eip;
- if (report_error == NULL) {
- return;
- }
- eip = ErrorInfoNew (NULL);
- if (eip == NULL) {
- return;
- }
- eip->category = eAlnErr_BadData;
- eip->message = strdup ("Mismatched definition lines");
- report_error (eip, report_error_userdata);
- }
- /* This function recursively creates and sends an error message
- * regarding the number of times items in list appear.
- */
- static void
- s_ReportDefinitionLines
- (TStringCountPtr list,
- FReportErrorFunction report_error,
- void * report_error_userdata)
- {
- TErrorInfoPtr eip;
- const char * err_null_format = "Null definition line occurs %d times";
- const char * err_format = "Definition line %s occurs %d times";
- if (list == NULL || report_error == NULL) {
- return;
- }
- eip = ErrorInfoNew (NULL);
- if (eip == NULL) {
- return;
- }
- eip->category = eAlnErr_BadData;
- if (list->string == NULL) {
- eip->message = malloc (strlen (err_null_format)
- + kMaxPrintedIntLen + 1);
- if (eip->message != NULL) {
- sprintf (eip->message, err_null_format, list->num_appearances);
- }
- } else {
- eip->message = malloc (strlen (err_format)
- + strlen (list->string)
- + kMaxPrintedIntLen + 1);
- if (eip->message != NULL) {
- sprintf (eip->message, err_format, list->string,
- list->num_appearances);
- }
- }
- report_error (eip, report_error_userdata);
-
- s_ReportDefinitionLines (list->next, report_error, report_error_userdata);
- }
-
- /* This function creates and sends an error message regarding a line of
- * sequence data that was expected to be a different length.
- */
- static void
- s_ReportLineLengthError
- (char * id,
- TLineInfoPtr lip,
- int expected_length,
- FReportErrorFunction report_error,
- void * report_error_userdata)
- {
- TErrorInfoPtr eip;
- char * msg;
- const char * format = "Expected line length %d, actual length %d";
- int len;
- if (lip == NULL || report_error == NULL) {
- return;
- }
- eip = ErrorInfoNew (NULL);
- if (eip == NULL) {
- return;
- }
- eip->category = eAlnErr_BadFormat;
- eip->id = strdup (id);
- eip->line_num = lip->line_num;
- msg = (char *) malloc (strlen (format) + kMaxPrintedIntLen + 1);
- if (msg != NULL) {
- if (lip->data == NULL) {
- len = 0;
- } else {
- len = strlen (lip->data);
- }
- sprintf (msg, format, expected_length, len);
- eip->message = msg;
- }
- report_error (eip, report_error_userdata);
- }
- /* This function creates and sends an error message regarding a block of
- * sequence data that was expected to contain more lines.
- */
- static void
- s_ReportBlockLengthError
- (char * id,
- int line_num,
- int expected_num,
- int actual_num,
- FReportErrorFunction report_error,
- void * report_error_userdata)
- {
- TErrorInfoPtr eip;
- const char * err_format = "Expected %d lines in block, found %d";
- if (report_error == NULL) {
- return;
- }
- eip = ErrorInfoNew (NULL);
- if (eip == NULL) {
- return;
- }
- eip->category = eAlnErr_BadFormat;
- eip->id = strdup (id);
- eip->line_num = line_num;
- eip->message = malloc (strlen (err_format) + 2 * kMaxPrintedIntLen + 1);
- if (eip->message != NULL) {
- sprintf (eip->message, err_format, expected_num, actual_num);
- }
- report_error (eip, report_error_userdata);
- }
- /* This function creates and sends an error message regarding missing
- * sequence data.
- */
- static void
- s_ReportMissingSequenceData
- (char * id,
- FReportErrorFunction report_error,
- void * report_error_userdata)
- {
- TErrorInfoPtr eip;
- if (report_error == NULL) {
- return;
- }
- eip = ErrorInfoNew (NULL);
- if (eip == NULL) {
- return;
- }
- eip->category = eAlnErr_Fatal;
- eip->id = strdup (id);
- eip->message = strdup ("No data found");
- report_error (eip, report_error_userdata);
- }
- /* This function creates and sends an error message indicating that the
- * most common length of the sequences in the file do not match a comment
- * found in the file.
- */
- static void
- s_ReportBadSequenceLength
- (char * id,
- int expected_length,
- int actual_length,
- FReportErrorFunction report_error,
- void * report_error_userdata)
- {
- TErrorInfoPtr eip;
- const char * format_str = "Expected sequence length %d, actual length %d";
- if (report_error == NULL) {
- return;
- }
- eip = ErrorInfoNew (NULL);
- if (eip == NULL) {
- return;
- }
- eip->category = eAlnErr_BadFormat;
- eip->id = strdup (id);
- eip->message = malloc (strlen (format_str) + 50);
- if (eip->message != NULL) {
- sprintf (eip->message, format_str, expected_length, actual_length);
- }
- report_error (eip, report_error_userdata);
- }
- /* This function creates and sends an error message indicating that the
- * number of sequences read does not match a comment in the alignment file.
- */
- static void
- s_ReportIncorrectNumberOfSequences
- (int num_expected,
- int num_found,
- FReportErrorFunction report_error,
- void * report_error_userdata)
- {
- TErrorInfoPtr eip;
- const char * err_format = "Expected %d sequences, found %d";
-
- if (report_error == NULL) {
- return;
- }
- eip = ErrorInfoNew (NULL);
- if (eip == NULL) {
- return;
- }
- eip->category = eAlnErr_BadFormat;
- eip->message = (char *) malloc (strlen (err_format) +
- 2 * kMaxPrintedIntLen + 1);
-
- if (eip->message != NULL)
- {
- sprintf (eip->message, err_format, num_expected, num_found);
- }
- report_error (eip, report_error_userdata);
- }
- static void
- s_ReportIncorrectSequenceLength
- (int len_expected,
- int len_found,
- FReportErrorFunction report_error,
- void * report_error_userdata)
- {
- TErrorInfoPtr eip;
- const char * err_format = "Expected sequences of length %d, found %d";
- if (report_error == NULL) {
- return;
- }
- eip = ErrorInfoNew (NULL);
- if (eip == NULL) {
- return;
- }
- eip->category = eAlnErr_BadFormat;
- eip->message = (char *)malloc (strlen (err_format)
- + 2 * kMaxPrintedIntLen + 1);
- if (eip->message != NULL)
- {
- sprintf (eip->message, err_format, len_expected, len_found);
- }
- report_error (eip, report_error_userdata);
- }
- /* This function creates and sends an error message regarding a non-unique
- * organism name.
- */
- static void
- s_ReportRepeatedOrganismName
- (char * id,
- int line_num,
- int second_line_num,
- char * org_name,
- FReportErrorFunction report_error,
- void * report_error_userdata)
- {
- TErrorInfoPtr eip;
- const char * err_format = "Organism name %s also appears at line %d";
- if (report_error == NULL || org_name == NULL) {
- return;
- }
- eip = ErrorInfoNew (NULL);
- if (eip == NULL) {
- return;
- }
- eip->category = eAlnErr_BadData;
- eip->line_num = line_num;
- if (id != NULL ) {
- eip->id = strdup (id);
- }
- eip->message = malloc (strlen (err_format) + strlen (org_name)
- + kMaxPrintedIntLen + 1);
- if (eip->message != NULL) {
- sprintf (eip->message, err_format, org_name, second_line_num);
- }
- report_error (eip, report_error_userdata);
- }
- /* This function creates and sends an error message indicating that some or
- * all of the organism information for the sequences are missing.
- */
- static void
- s_ReportMissingOrganismInfo
- (FReportErrorFunction report_error,
- void * report_error_userdata)
- {
- TErrorInfoPtr eip;
- if (report_error == NULL) {
- return;
- }
- eip = ErrorInfoNew (NULL);
- if (eip == NULL) {
- return;
- }
- eip->category = eAlnErr_BadData;
- eip->message = strdup ("Missing organism information");
- report_error (eip, report_error_userdata);
- }
- /* This function creates and sends an error message regarding an ID that is
- * used for more than one sequence.
- */
- static void
- s_ReportRepeatedId
- (TStringCountPtr scp,
- FReportErrorFunction report_error,
- void * report_error_userdata)
- {
- TErrorInfoPtr eip;
- const char * err_format = "ID %s appears in the following locations:";
- char * cp;
- TIntLinkPtr line_number;
- if (report_error == NULL || scp == NULL || scp->string == NULL) {
- return;
- }
- eip = ErrorInfoNew (NULL);
- if (eip == NULL) {
- return;
- }
- eip->category = eAlnErr_BadData;
- eip->id = strdup (scp->string);
- if (scp->line_numbers != NULL) {
- eip->line_num = scp->line_numbers->ival;
- }
- eip->message = (char *) malloc ( strlen (err_format)
- + strlen (scp->string)
- + scp->num_appearances * 15
- + 1);
- if (eip->message != NULL) {
- sprintf (eip->message, err_format, scp->string);
- cp = eip->message + strlen (eip->message);
- for (line_number = scp->line_numbers;
- line_number != NULL;
- line_number = line_number->next) {
- sprintf (cp, " %d", line_number->ival);
- cp += strlen (cp);
- }
- }
- report_error (eip, report_error_userdata);
- }
- /* This function creates and sends an error message indicating that the file
- * being read is an ASN.1 file.
- */
- static void
- s_ReportASN1Error
- (FReportErrorFunction errfunc,
- void * errdata)
- {
- TErrorInfoPtr eip;
- const char * msg = "This is an ASN.1 file, "
- "which cannot be read by this function.";
- if (errfunc == NULL) {
- return;
- }
- eip = ErrorInfoNew (NULL);
- if (eip != NULL) {
- eip->category = eAlnErr_BadData;
- eip->message = (char *) malloc (strlen (msg) + 1);
- if (eip->message != NULL) {
- sprintf (eip->message, msg);
- }
- errfunc (eip, errdata);
- }
- }
- /* This function reports that some sequences are inside brackets (indicating a segmented set)
- * and that some sequences are outside the brackets.
- */
- static void
- s_ReportSegmentedAlignmentError
- (TIntLinkPtr offset_list,
- FReportErrorFunction errfunc,
- void * errdata)
- {
- TErrorInfoPtr eip;
- const char * msg = "This file contains sequences in brackets (indicating "
- "a segmented alignment) as well as sequences not in brackets at lines "
- "%s. Please either add or remove brackets to correct this problem.";
- int num_lines = 0;
- int msg_len = 0;
- TIntLinkPtr t;
- char * line_text_list;
- char * line_text_list_offset;
- if (errfunc == NULL || offset_list == NULL) {
- return;
- }
- for (t = offset_list; t != NULL; t = t->next)
- {
- num_lines ++;
- }
- msg_len = num_lines * (kMaxPrintedIntLen + 2);
- if (num_lines > 1)
- {
- msg_len += 4;
- }
- line_text_list = (char *) malloc (msg_len);
- if (line_text_list == NULL) return;
- line_text_list_offset = line_text_list;
- for (t = offset_list; t != NULL; t = t->next)
- {
- if (t->next == NULL)
- {
- sprintf (line_text_list_offset, "%d", t->ival);
- }
- else if (num_lines == 2)
- {
- sprintf (line_text_list_offset, "%d and ", t->ival);
- }
- else if (t->next->next == NULL)
- {
- sprintf (line_text_list_offset, "%d, and ", t->ival);
- }
- else
- {
- sprintf (line_text_list_offset, "%d, ", t->ival);
- }
- line_text_list_offset += strlen (line_text_list_offset);
- }
- msg_len += strlen (msg) + 1;
- eip = ErrorInfoNew (NULL);
- if (eip != NULL) {
- eip->category = eAlnErr_BadData;
- eip->message = (char *) malloc (msg_len);
- if (eip->message != NULL) {
- sprintf (eip->message, msg, line_text_list);
- }
- errfunc (eip, errdata);
- }
- free (line_text_list);
- }
- /* This function reports an error if a line looks like it might contain an organism comment
- * but is somehow improperly formatted
- */
- static void s_ReportOrgCommentError
- (char * linestring,
- FReportErrorFunction errfunc,
- void * errdata)
- {
- TErrorInfoPtr eip;
- const char * msg = "This line may contain an improperly formatted organism description.n"
- "Organism descriptions should be of the form [org=tax name] or [organism=tax name].n";
-
- if (errfunc == NULL || linestring == NULL) {
- return;
- }
-
- eip = ErrorInfoNew (NULL);
- if (eip != NULL) {
- eip->category = eAlnErr_BadData;
- eip->message = (char *) malloc (strlen (msg) + strlen (linestring) + 1);
- if (eip->message != NULL) {
- strcpy (eip->message, msg);
- strcat (eip->message, linestring);
- }
- errfunc (eip, errdata);
- }
- }
-
- /* This function reports that the number of segments in an alignment of
- * segmented sets is inconsistent.
- */
- static void s_ReportBadNumSegError
- (int line_num,
- int num_seg,
- int num_seg_exp,
- FReportErrorFunction errfunc,
- void * errdata)
- {
- TErrorInfoPtr eip;
- const char * msg = "This segmented set contains a different number of segments (%d) than expected (%d).n";
-
- if (errfunc == NULL) {
- return;
- }
-
- eip = ErrorInfoNew (NULL);
- if (eip != NULL) {
- eip->line_num = line_num;
- eip->category = eAlnErr_BadData;
- eip->message = (char *) malloc (strlen (msg) + 2 * kMaxPrintedIntLen + 1);
- if (eip->message != NULL) {
- sprintf (eip->message, msg, num_seg, num_seg_exp);
- }
- errfunc (eip, errdata);
- }
- }
-
- /* This function allocates memory for a SSequenceInfo structure and
- * initializes the member variables. It returns a pointer to the newly
- * allocated memory.
- */
- extern TSequenceInfoPtr SequenceInfoNew (void)
- {
- TSequenceInfoPtr sip;
- sip = (TSequenceInfoPtr) malloc (sizeof (SSequenceInfo));
- if (sip == NULL) {
- return NULL;
- }
- sip->missing = strdup ("?");
- sip->beginning_gap = strdup (".");
- sip->middle_gap = strdup ("-");
- sip->end_gap = strdup (".");
- sip->match = strdup (".");
- sip->alphabet = NULL;
- return sip;
- }
- /* This function frees memory associated with the member variables of
- * the SSequenceInfo structure and with the structure itself.
- */
- extern void SequenceInfoFree (TSequenceInfoPtr sip)
- {
- if (sip == NULL) {
- return;
- }
- free (sip->alphabet);
- free (sip->missing);
- free (sip->beginning_gap);
- free (sip->middle_gap);
- free (sip->end_gap);
- free (sip->match);
- sip->alphabet = NULL;
- free (sip);
- }
- /* This function creates and sends an error message regarding an unused line.
- */
- static void
- s_ReportUnusedLine
- (int line_num_start,
- int line_num_stop,
- TLineInfoPtr line_val,
- FReportErrorFunction errfunc,
- void * errdata)
- {
- TErrorInfoPtr eip;
- const char * errformat1 = "Line %d could not be assigned to an interleaved block";
- const char * errformat2 = "Lines %d through %d could not be assigned to an interleaved block";
- const char * errformat3 = "Contents of unused line: %s";
- int skip;
- if (errfunc == NULL || line_val == NULL) {
- return;
- }
- eip = ErrorInfoNew (NULL);
- if (eip != NULL) {
- eip->category = eAlnErr_BadFormat;
- eip->line_num = line_num_start;
- if (line_num_start == line_num_stop) {
- eip->message = (char *) malloc (strlen (errformat1)
- + kMaxPrintedIntLen + 1);
- if (eip->message != NULL) {
- sprintf (eip->message, errformat1, line_num_start);
- }
- } else {
- eip->message = (char *) malloc (strlen (errformat2)
- + 2 * kMaxPrintedIntLen + 1);
- if (eip->message != NULL) {
- sprintf (eip->message, errformat2, line_num_start,
- line_num_stop);
- }
- }
- errfunc (eip, errdata);
- }
- /* report contents of unused lines */
- for (skip = line_num_start;
- skip < line_num_stop + 1 && line_val != NULL;
- skip++) {
- if (line_val->data == NULL) {
- continue;
- }
- eip = ErrorInfoNew (NULL);
- if (eip != NULL) {
- eip->category = eAlnErr_BadFormat;
- eip->line_num = skip;
- eip->message = (char *) malloc (strlen (errformat3)
- + strlen (line_val->data) + 1);
- if (eip->message != NULL) {
- sprintf (eip->message, errformat3, line_val->data);
- }
- errfunc (eip, errdata);
- }
- line_val = line_val->next;
- }
- }
- /* The following functions are used to manage a linked list of integer
- * values.
- */
- /* This function creates a new SIntLink structure with a value of ival.
- * The new structure will be placed at the end of list if list is not NULL.
- * The function will return a pointer to the new structure.
- */
- static TIntLinkPtr
- s_IntLinkNew
- (int ival,
- TIntLinkPtr list)
- {
- TIntLinkPtr ilp, last;
- ilp = (TIntLinkPtr) malloc (sizeof (SIntLink));
- if (ilp == NULL) {
- return NULL;
- }
- ilp->ival = ival;
- ilp->next = NULL;
- last = list;
- while (last != NULL && last->next != NULL) {
- last = last->next;
- }
- if (last != NULL) {
- last->next = ilp;
- }
- return ilp;
- }
- /* This function recursively frees memory associated with a linked list
- * of SIntLink structures.
- */
- static void s_IntLinkFree (TIntLinkPtr ilp)
- {
- if (ilp == NULL) {
- return;
- }
- s_IntLinkFree (ilp->next);
- free (ilp);
- }
- /* These functions are used to accumulate and retrieve information on
- * how often a size of data (number of lines or number of characters) occurs.
- */
- /* This function allocates space for a new SSizeInfo structure and
- * initializes its member variables. If list is not NULL, the new structure
- * is added to the end of the list.
- * The function returns a pointer to the newly allocated structure.
- */
- static TSizeInfoPtr s_SizeInfoNew (TSizeInfoPtr list)
- {
- TSizeInfoPtr sip, last;
- sip = (TSizeInfoPtr) malloc (sizeof (SSizeInfo));
- if (sip == NULL) {
- return NULL;
- }
- sip->size_value = 0;
- sip->num_appearances = 0;
- sip->next = NULL;
- last = list;
- while (last != NULL && last->next != NULL) {
- last = last->next;
- }
- if (last != NULL) {
- last->next = sip;
- }
- return sip;
- }
- /* This function recursively frees the memory associated with a linked list
- * of SSizeInfo structures.
- */
- static void s_SizeInfoFree (TSizeInfoPtr list)
- {
- if (list == NULL) {
- return;
- }
- s_SizeInfoFree (list->next);
- list->next = NULL;
- free (list);
- }
- /* This function returns eTrue if the two SSizeInfo structures have
- * the same size_value and number of appearances, eFalse otherwise.
- */
- static EBool
- s_SizeInfoIsEqual
- (TSizeInfoPtr s1,
- TSizeInfoPtr s2)
- {
- if (s1 == NULL
- || s2 == NULL
- || s1->size_value != s2->size_value
- || s1->num_appearances != s2->num_appearances) {
- return eFalse;
- }
- return eTrue;
- }
- /* This function searches list for a SSizeInfo structure with the
- * same size_value as size_value. If it finds such a structure, it
- * adds the value of num_appearances to the num_appearances for that
- * structure, otherwise the function creates a new structure at the end
- * of the list with the specified values of size_value and num_appearances.
- * The function returns a pointer to the list of SSizeInfo structures.
- */
- static TSizeInfoPtr s_AddSizeInfoAppearances
- (TSizeInfoPtr list,
- int size_value,
- int num_appearances)
- {
- TSizeInfoPtr p, last;
- last = NULL;
- for (p = list; p != NULL && p->size_value != size_value; p = p->next) {
- last = p;
- }
- if (p == NULL) {
- p = (TSizeInfoPtr) malloc (sizeof (SSizeInfo));
- if (p == NULL) {
- return NULL;
- }
- p->size_value = size_value;
- p->num_appearances = num_appearances;
- p->next = 0;
- if (last == NULL) {
- list = p;
- } else {
- last->next = p;
- }
- } else {
- p->num_appearances += num_appearances;
- }
- return list;
- }
- /* This function searches list for a SSizeInfo structure with the
- * same size_value as size_value. If it finds such a structure, it
- * adds one to the num_appearances for that structure, otherwise the
- * function creates a new structure at the end of the list with the
- * specified values of size_value and num_appearances.
- * The function returns a pointer to the list of SSizeInfo structures.
- */
- static TSizeInfoPtr
- s_AddSizeInfo
- (TSizeInfoPtr list,
- int size_value)
- {
- return s_AddSizeInfoAppearances (list, size_value, 1);
- }
- /* This function searches list for the SSizeInfo structure with the
- * highest value for num_appearances. If more than one structure exists
- * with the highest value for num_appearances, the function chooses the
- * value with the highest value for size_value. The function returns a
- * pointer to the structure selected based on the above criteria.
- */
- static TSizeInfoPtr s_GetMostPopularSizeInfo (TSizeInfoPtr list)
- {
- TSizeInfoPtr p, best;
- if (list == NULL) {
- return NULL;
- }
- best = list;
- for (p = list->next; p != NULL; p = p->next) {
- if (p->num_appearances > best->num_appearances
- || (p->num_appearances == best->num_appearances
- && p->size_value > best->size_value)) {
- best = p;
- }
- }
- return best;
- }
- /* This function uses s_GetMostPopularSizeInfo function to find the structure
- * in list that has the highest value for num_appearances and size_value.
- * If such a structure is found and has a num_appearances value greater than
- * one, the size_value for that structure will be returned, otherwise the
- * function returns 0.
- */
- static int s_GetMostPopularSize (TSizeInfoPtr list)
- {
- TSizeInfoPtr best;
- best = s_GetMostPopularSizeInfo (list);
- if (best == NULL) {
- return 0;
- }
- if (best->num_appearances > 1) {
- return best->size_value;
- } else {
- return 0;
- }
- }
- /* The following functions are used to keep track of patterns of line or
- * token lengths, which will be used to identify errors in formatting.
- */
- static SLengthListPtr s_LengthListNew (SLengthListPtr list)
- {
- SLengthListPtr llp, last;
- llp = (SLengthListPtr) malloc (sizeof (SLengthListData));
- if (llp == NULL) {
- return NULL;
- }
- llp->lengthrepeats = NULL;
- llp->num_appearances = 0;
- llp->next = NULL;
- last = list;
- while (last != NULL && last->next != NULL) {
- last = last->next;
- }
- if (last != NULL) {
- last->next = llp;
- }
- return llp;
- }
- /* This function recursively frees memory for a list of SLengthListData
- * structures and its member variables.
- */
- static void s_LengthListFree (SLengthListPtr llp)
- {
- if (llp == NULL) {
- return;
- }
- s_LengthListFree (llp->next);
- s_SizeInfoFree (llp->lengthrepeats);
- free (llp);
- }
- /* This function examines the last SSizeInfo structure in the
- * lengthrepeats member variable of llp. If the last structure
- * in the list has the same size_value value as the function argument
- * size_value, the value of num_appearances for that SizeInforData structure
- * will be incremented. Otherwise a new SSizeInfo structure will be
- * appended to the end of the lengthrepeats list with the specified
- * size_value and a num_appearances value of 1.
- */
- static void
- s_AddLengthRepeat
- (SLengthListPtr llp,
- int size_value)
- {
- TSizeInfoPtr p, last;
- if (llp == NULL) {
- return;
- }
- last = NULL;
- for (p = llp->lengthrepeats; p != NULL; p = p->next) {
- last = p;
- }
- if (last == NULL || last->size_value != size_value) {
- p = (TSizeInfoPtr) malloc (sizeof (SSizeInfo));
- if (p == NULL) {
- return;
- }
- p->size_value = size_value;
- p->num_appearances = 1;
- p->next = 0;
- if (last == NULL) {
- llp->lengthrepeats = p;
- } else {
- last->next = p;
- }
- } else {
- last->num_appearances ++;
- }
- }
- /* This function examines whether two SLengthListData structures "match" -
- * the structures match if each SSizeInfo structure in llp1->lengthrepeats
- * has the same size_value and num_appearances values as the SSizeInfo
- * structure in the corresponding list position in llp2->lenghrepeats.
- * If the two structures match, the function returns eTrue, otherwise the
- * function returns eFalse.
- */
- static EBool
- s_DoLengthPatternsMatch
- (SLengthListPtr llp1,
- SLengthListPtr llp2)
- {
- TSizeInfoPtr sip1, sip2;
- if (llp1 == NULL || llp2 == NULL
- || llp1->lengthrepeats == NULL
- || llp2->lengthrepeats == NULL) {
- return eFalse;
- }
- for (sip1 = llp1->lengthrepeats, sip2 = llp2->lengthrepeats;
- sip1 != NULL && sip2 != NULL;
- sip1 = sip1->next, sip2 = sip2->next) {
- if ( ! s_SizeInfoIsEqual (sip1, sip2)
- || (sip1->next == NULL && sip2->next != NULL)
- || (sip1->next != NULL && sip2->next == NULL)) {
- return eFalse;
- }
- }
- return eTrue;
- }
- /* This function examines a list of SLengthListData structures to see if
- * one of them matches llp. If so, the value of num_appearances in that
- * list is incremented by one and llp is freed, otherwise llp is added
- * to the end of the list.
- * The function returns a pointer to the list of LenghtListData structures.
- */
- static SLengthListPtr
- s_AddLengthList
- (SLengthListPtr list,
- SLengthListPtr llp)
- {
- SLengthListPtr prev_llp;
- if (list == NULL) {
- list = llp;
- } else {
- prev_llp = list;
- while ( prev_llp->next && ! s_DoLengthPatternsMatch (prev_llp, llp)) {
- prev_llp = prev_llp->next;
- }
- if (s_DoLengthPatternsMatch (prev_llp, llp)) {
- prev_llp->num_appearances ++;
- s_LengthListFree (llp);
- } else {
- prev_llp->next = llp;
- }
- }
- return list;
- }
- /* This function examines the last SLengthListData structure in list to
- * see if it matches llp. If so, the function increments the value of
- * num_appearances for the last SLengthListData structure in list and
- * frees llp, otherwise the function appends llp to the end of list.
- * The function returns a pointer to the list of SLengthListData structures.
- */
- static SLengthListPtr
- s_AddPatternRepeat
- (SLengthListPtr list,
- SLengthListPtr llp)
- {
- SLengthListPtr prev_llp;
- if (list == NULL) {
- list = llp;
- } else {
- prev_llp = list;
- while ( prev_llp->next != NULL ) {
- prev_llp = prev_llp->next;
- }
- if (s_DoLengthPatternsMatch (prev_llp, llp)) {
- prev_llp->num_appearances ++;
- s_LengthListFree (llp);
- } else {
- prev_llp->next = llp;
- }
- }
- return list;
- }
- /* This set of functions is used for storing and analyzing individual lines
- * or tokens from an alignment file.
- */
- /* This function allocates memory for a new SLineInfo structure and
- * initializes the structure with a saved copy of string and the specified
- * values of line_num and line_offset.
- * The function returns a pointer to the new SLineInfo structure.
- */
- static TLineInfoPtr
- s_LineInfoNew
- (char * string,
- int line_num,
- int line_offset)
- {
- TLineInfoPtr lip;
- lip = (TLineInfoPtr) malloc (sizeof (SLineInfo));
- if (lip == NULL) {
- return NULL;
- }
- lip->data = strdup (string);
- lip->line_num = line_num;
- lip->line_offset = line_offset;
- lip->delete_me = eFalse;
- lip->next = NULL;
- return lip;
- }
- /* This function recursively frees the memory associated with the structures
- * and members of the structures in a linked list of SLineInfo structures.
- */
- static void s_LineInfoFree (TLineInfoPtr lip)
- {
- if (lip == NULL) {
- return;
- }
- s_LineInfoFree (lip->next);
- lip->next = NULL;
- free (lip->data);
- free (lip);
- }
- /* This function deletes from a linked list of SLineInfo structures
- * those structures for which the delete_me flag has been set. The function
- * returns a pointer to the beginning of the new list.
- */
- static TLineInfoPtr s_DeleteLineInfos (TLineInfoPtr list)
- {
- TLineInfoPtr prev = NULL;
- TLineInfoPtr lip, nextlip;
- lip = list;
- while (lip != NULL) {
- nextlip = lip->next;
- if (lip->delete_me) {
- if (prev != NULL) {
- prev->next = lip->next;
- } else {
- list = lip->next;
- }
- lip->next = NULL;
- s_LineInfoFree (lip);
- } else {
- prev = lip;
- }
- lip = nextlip;
- }
- return list;
- }
-
-
- /* This function creates a new SLineInfo structure, populates it with
- * a copy of string and the specified line_num and line_offset values,
- * and appends it to the end of "list" if list is not NULL.
- * The function will return a pointer to the newly created structure
- * if list is NULL, otherwise the function will return list.
- */
- static TLineInfoPtr
- s_AddLineInfo
- (TLineInfoPtr list,
- char * string,
- int line_num,
- int line_offset)
- {
- TLineInfoPtr lip, p;
- if (string == NULL) {
- return list;
- }
- lip = s_LineInfoNew (string, line_num, line_offset);
- if (lip == NULL) {
- return NULL;
- }
- if (list == NULL) {
- list = lip;
- } else {
- p = list;
- while (p != NULL && p->next != NULL) {
- p = p->next;
- }
- p->next = lip;
- }
- return list;
- }
- /* This function creates a new bracketed comment */
- static TBracketedCommentListPtr s_BracketedCommentListNew
- (TBracketedCommentListPtr list,
- char * string,
- int line_num,
- int line_offset)
- {
- TBracketedCommentListPtr comment;
-
- comment = (TBracketedCommentListPtr) malloc (sizeof (SBracketedCommentList));
- if (comment == NULL) {
- return NULL;
- }
- comment->comment_lines = s_LineInfoNew (string, line_num, line_offset);
- comment->next = NULL;
-
- if (list != NULL) {
- while (list->next != NULL) {
- list = list->next;
- }
- list->next = comment;
- }
-
- return comment;
- }
- /* This function frees a bracketed comment list. */
- static void s_BracketedCommentListFree (TBracketedCommentListPtr list)
- {
- if (list == NULL) {
- return;
- }
- s_BracketedCommentListFree (list->next);
- list->next = NULL;
- s_LineInfoFree (list->comment_lines);
- }
- /* This function adds a line to a bracketed comment. */
- static void s_BracketedCommentListAddLine
- (TBracketedCommentListPtr comment,
- char * string,
- int line_num,
- int line_offset)
- {
- if (comment == NULL) {
- return;
- }
- comment->comment_lines = s_AddLineInfo (comment->comment_lines, string, line_num, line_offset);
- }
- /* This function counts the sequences found in a bracketed comment. */
- static int s_CountSequencesInBracketedComment (TBracketedCommentListPtr comment)
- {
- TLineInfoPtr lip;
- int num_segments = 0;
- EBool skipped_line_since_last_defline = eTrue;
-
- if (comment == NULL || comment->comment_lines == NULL) {
- return 0;
- }
-
- lip = comment->comment_lines;
- /* First line must be left bracket on a line by itself */
- if (lip->data[0] != '[' || strspn (lip->data + 1, " trn") != strlen (lip->data + 1))
- {
- return 0;
- }
- lip = lip->next;
- while (lip != NULL && lip->next != NULL)
- {
- if (lip->data[0] == '>')
- {
- if (!skipped_line_since_last_defline)
- {
- return 0;
- }
- else
- {
- num_segments ++;
- skipped_line_since_last_defline = eFalse;
- }
- }
- else
- {
- skipped_line_since_last_defline = eTrue;
- }
- lip = lip->next;
- }
- /* Last line must be right bracket on a line by itself */
- /* First line must be left bracket on a line by itself */
- if (lip->data[0] != ']' || strspn (lip->data + 1, " trn") != strlen (lip->data + 1))
- {
- return 0;
- }
-
- return num_segments;
- }
- /* This function counts the number of sequences that appear in
- * bracketed comments. If the number of sequences is inconsistent,
- * the function will issue error messages and return a 1, otherwise
- * the function will return the number of sequences that appear in
- * each bracketed comment.
- */
- static int s_GetNumSegmentsInAlignment
- (TBracketedCommentListPtr comment_list,
- FReportErrorFunction errfunc,
- void * errdata)
- {
- TBracketedCommentListPtr comment;
- TSizeInfoPtr segcount_list = NULL;
- int num_segments = 1;
- int num_segments_this_bracket;
- int num_segments_expected;
- TSizeInfoPtr best;
-
- if (comment_list == NULL)
- {
- return num_segments;
- }
-
- for (comment = comment_list; comment != NULL; comment = comment->next)
- {
- num_segments_this_bracket = s_CountSequencesInBracketedComment (comment);
- segcount_list = s_AddSizeInfoAppearances (segcount_list,
- num_segments_this_bracket,
- 1);
- if (comment != comment_list && segcount_list->next != NULL)
- {
- best = s_GetMostPopularSizeInfo (segcount_list);
- num_segments_expected = best->size_value;
- if (num_segments_expected != num_segments_this_bracket)
- {
- s_ReportBadNumSegError (comment->comment_lines->line_num,
- num_segments_this_bracket, num_segments_expected,
- errfunc, errdata);
- }
- }
- }
- if (segcount_list != NULL && segcount_list->next == NULL && segcount_list->size_value > 0)
- {
- num_segments = segcount_list->size_value;
- }
- s_SizeInfoFree (segcount_list);
- return num_segments;
- }
- /* This function gets a list of the offsets of the
- * sequences in bracketed comments.
- */
- static TIntLinkPtr GetSegmentOffsetList (TBracketedCommentListPtr comment_list)
- {
- TIntLinkPtr new_offset, offset_list = NULL;
- TBracketedCommentListPtr comment;
- TLineInfoPtr lip;
- if (comment_list == NULL)
- {
- return NULL;
- }
-
- for (comment = comment_list; comment != NULL; comment = comment->next)
- {
- if (s_CountSequencesInBracketedComment (comment) == 0)
- {
- continue;
- }
- for (lip = comment->comment_lines; lip != NULL; lip = lip->next)
- {
- if (lip->data != NULL && lip->data[0] == '>')
- {
- new_offset = s_IntLinkNew (lip->line_num + 1, offset_list);
- if (offset_list == NULL) offset_list = new_offset;
- }
- }
- }
- return offset_list;
- }
/* Re-entrant tokenizer in the style of strtok_r.
 * On the first call pass the string to split in str; on later calls pass
 * NULL and the scan resumes at the position saved in *last.  Leading
 * delimiter characters are skipped, the token is terminated in place by
 * overwriting the delimiter that follows it with a NUL, and *last is set to
 * the character after that delimiter (or to the terminating NUL when the
 * token ends the string).
 * Returns a pointer to the token, or NULL when delimiter is NULL (in which
 * case *last is also set to NULL) or when there is nothing left to scan.
 * Note that when only delimiter characters remain, an empty token is
 * returned once before NULL is returned.
 */
static char * s_TokenizeString (char * str, char *delimiter, char **last)
{
    char * token;
    char * token_end;

    if (str == NULL) {
        str = *last;
    }
    if (delimiter == NULL) {
        *last = NULL;
        return NULL;
    }
    if (str == NULL || *str == 0) {
        return NULL;
    }

    token = str + (int) strspn (str, delimiter);
    token_end = token + (int) strcspn (token, delimiter);
    if (*token_end == 0) {
        *last = token_end;
    } else {
        *token_end = 0;
        *last = token_end + 1;
    }
    return token;
}
-
- /* This function creates a new list of SLineInfo structures by tokenizing
- * each data element from line_list into multiple tokens at whitespace.
- * The function returns a pointer to the new list. The original list is
- * unchanged.
- */
- static TLineInfoPtr s_BuildTokenList (TLineInfoPtr line_list)
- {
- TLineInfoPtr first_token, lip;
- char * tmp;
- char * piece;
- char * last;
- int line_pos;
- first_token = NULL;
- for (lip = line_list; lip != NULL; lip = lip->next) {
- if (lip->data != NULL && (tmp = strdup (lip->data)) != NULL) {
- piece = s_TokenizeString (tmp, " tr", &last);
- while (piece != NULL) {
- line_pos = piece - tmp;
- line_pos += lip->line_offset;
- first_token = s_AddLineInfo (first_token, piece,
- lip->line_num,
- line_pos);
- piece = s_TokenizeString (NULL, " tr", &last);
- }
- free (tmp);
- }
- }
- return first_token;
- }
- /* This function takes a list of SLineInfo structures, allocates memory
- * to hold their contents contiguously, and stores their contents, minus
- * the whitespace, in the newly allocated memory.
- * The function returns a pointer to this newly allocated memory.
- */
- static char * s_LineInfoMergeAndStripSpaces (TLineInfoPtr list)
- {
- TLineInfoPtr lip;
- int len;
- char * result;
- char * cp_to;
- char * cp_from;
- if (list == NULL) {
- return NULL;
- }
- len = 0;
- for (lip = list; lip != NULL; lip = lip->next) {
- if (lip->data != NULL) {
- len += strlen (lip->data);
- }
- }
- result = (char *) malloc (len + 1);
- if (result == NULL) {
- return result;
- }
- cp_to = result;
- for (lip = list; lip != NULL; lip = lip->next) {
- if (lip->data != NULL) {
- cp_from = lip->data;
- while (*cp_from != 0) {
- if (! isspace ((int )*cp_from)) {
- *cp_to = *cp_from;
- cp_to ++;
- }
- cp_from ++;
- }
- }
- }
- *cp_to = 0;
- return result;
- }
- /* The following functions are used to manage the SLineInfoReader
- * structure. The intention is to allow the user to access the data
- * from a linked list of SLineInfo structures using a given position
- * in the data based on the number of sequence data characters rather than
- * any particular line number or position in the line. This is useful
- * for matching up a data position in a record with a match character with
- * the same data position in the first or master record. This is also useful
- * for determining how to interpret special characters that may have
- * context-sensitive meanings. For example, a ? could indicate a missing
- * character if it is inside a sequence but indicate a gap if it is outside
- * a sequence.
- */
- /* This function is used to advance the current data position pointer
- * for a SLineInfoReader structure past white space and blank lines
- * in sequence data.
- */
- static void s_LineInfoReaderAdvancePastSpace (TLineInfoReaderPtr lirp)
- {
- if (lirp->curr_line_pos == NULL) {
- return;
- }
- while ( isspace ((int ) *lirp->curr_line_pos)
- || *lirp->curr_line_pos == 0) {
- while ( isspace ((int )*lirp->curr_line_pos)) {
- lirp->curr_line_pos ++;
- }
- if (*lirp->curr_line_pos == 0) {
- lirp->curr_line = lirp->curr_line->next;
- while (lirp->curr_line != NULL
- && lirp->curr_line->data == NULL) {
- lirp->curr_line = lirp->curr_line->next;
- }
- if (lirp->curr_line == NULL) {
- lirp->curr_line_pos = NULL;
- return;
- } else {
- lirp->curr_line_pos = lirp->curr_line->data;
- }
- }
- }
- }
- /* This function sets the current data position pointer to the first
- * non-whitespace character in the sequence data.
- */
- static void s_LineInfoReaderReset (TLineInfoReaderPtr lirp)
- {
- if (lirp == NULL) {
- return;
- }
- lirp->curr_line = lirp->first_line;
- while (lirp->curr_line != NULL && lirp->curr_line->data == NULL) {
- lirp->curr_line = lirp->curr_line->next;
- }
- if (lirp->curr_line == NULL) {
- lirp->curr_line_pos = NULL;
- lirp->data_pos = -1;
- } else {
- lirp->curr_line_pos = lirp->curr_line->data;
- s_LineInfoReaderAdvancePastSpace (lirp);
- if (lirp->curr_line_pos == NULL) {
- lirp->data_pos = -1;
- } else {
- lirp->data_pos = 0;
- }
- }
- }
-
- /* This function creates a new SLineInfoReader structure and initializes
- * its member variables. The current data position pointer is set to the
- * first non-whitespace character in the sequence data, and the data position
- * counter is set to zero. The function returns a pointer to the new
- * LineInfoReader data structure.
- */
- static TLineInfoReaderPtr s_LineInfoReaderNew (TLineInfoPtr line_list)
- {
- TLineInfoReaderPtr lirp;
- if (line_list == NULL) {
- return NULL;
- }
- lirp = (TLineInfoReaderPtr) malloc (sizeof (SLineInfoReader));
- if (lirp == NULL) {
- return NULL;
- }
- lirp->first_line = line_list;
- s_LineInfoReaderReset (lirp);
- return lirp;
- }
- /* This function safely interprets the current line number of the
- * SLineInfoReader structure. If the structure is NULL or the
- * current line is NULL (usually because the data position has been
- * advanced to the end of the available sequence data), the function
- * returns -1, since the current data position does not actually exist.
- * Otherwise, the line number of the character at the current data position
- * is returned.
- */
- static int s_LineInfoReaderGetCurrentLineNumber (TLineInfoReaderPtr lirp)
- {
- if (lirp == NULL || lirp->curr_line == NULL) {
- return -1;
- } else {
- return lirp->curr_line->line_num;
- }
- }
- /* This function safely interprets the position of the current data position
- * of the SLineInfoReader structure. If the structure is NULL or the
- * current line is NULL or the current line position is NULL (usually because
- * the data position has been advanced to the end of the available sequence
- * data), the function returns -1, since the current data position does not
- * actually exist.
- * Otherwise, the position within the line of the character at the current
- * data position is returned.
- */
- static int s_LineInfoReaderGetCurrentLineOffset (TLineInfoReaderPtr lirp)
- {
- if (lirp == NULL || lirp->curr_line == NULL
- || lirp->curr_line_pos == NULL) {
- return -1;
- } else {
- return lirp->curr_line->line_offset + lirp->curr_line_pos
- - lirp->curr_line->data;
- }
- }
- /* This function frees the memory associated with the SLineInfoReader
- * structure. Notice that this function does NOT free the SLineInfo list.
- * This is by design.
- */
- static void s_LineInfoReaderFree (TLineInfoReaderPtr lirp)
- {
- if (lirp == NULL) {
- return;
- }
- free (lirp);
- lirp = NULL;
- }
- /* This function retrieves the "pos"th sequence data character from the lines
- * of sequence data. If the data position requested is greater than the
- * current position, the current data pointer will be advanced until the
- * current position is the requested position or there is no more data. If
- * there is no more data, the function returns a 0. If the data position
- * requested is lower than the current position, the current position is reset
- * to the beginning of the sequence and advanced from there.
- * As a result, it is clearly more efficient to read the data in the forward
- * direction, but it is still possible to access the data randomly.
- */
- static char
- s_FindNthDataChar
- (TLineInfoReaderPtr lirp,
- int pos)
- {
- if (lirp == NULL || lirp->first_line == NULL || pos < 0
- || lirp->data_pos == -1) {
- return 0;
- }
- if (lirp->data_pos == pos) {
- if (lirp->curr_line_pos == NULL) {
- return 0;
- } else {
- return *lirp->curr_line_pos;
- }
- }
- if (lirp->data_pos > pos) {
- s_LineInfoReaderReset (lirp);
- }
-
- while (lirp->data_pos < pos && lirp->curr_line != NULL) {
- lirp->curr_line_pos ++;
- /* skip over spaces, progress to next line if necessary */
- s_LineInfoReaderAdvancePastSpace (lirp);
- lirp->data_pos ++;
- }
- if (lirp->curr_line_pos != NULL) {
- return *lirp->curr_line_pos;
- } else {
- return 0;
- }
- }
- /* The following functions are used to manage the SStringCount structure.
- * These functions are useful for determining whether a string is unique
- * or whether only one string is used for a particular purpose.
- * The structure also tracks the line numbers on which a particular string
- * appeared.
- */
- /* This function allocates memory for a new SStringCount structure,
- * initializes its member variables. The function also places the
- * structure at the end of list if list is not NULL.
- * The function returns a pointer to the newly allocated SStringCount
- * structure.
- */
- static TStringCountPtr s_StringCountNew (TStringCountPtr list)
- {
- TStringCountPtr new_item, last;
- new_item = (TStringCountPtr) malloc (sizeof (SStringCount));
- if (new_item == NULL) {
- return NULL;
- }
- new_item->string = NULL;
- new_item->num_appearances = 0;
- new_item->line_numbers = NULL;
- new_item->next = NULL;
- last = list;
- while (last != NULL && last->next != NULL) {
- last = last->next;
- }
- if (last != NULL) {
- last->next = new_item;
- }
- return new_item;
- }
- /* This function recursively frees data associated with the structures
- * and structure member variables in a linked list of SStringCount
- * structures.
- */
- static void s_StringCountFree (TStringCountPtr list)
- {
- if (list == NULL) {
- return;
- }
- s_StringCountFree (list->next);
- s_IntLinkFree (list->line_numbers);
- free (list);
- }
- /* This function searches list to see if the string matches any of the
- * existing entries. If so, the num_appearances value for that entry is
- * increased and the line_num is added to that entry's list of line numbers.
- * Otherwise a new entry is created at the end of the list.
- * The function returns list if list was not NULL, or a pointer to the
- * newly created SStringCount structure otherwise.
- */
- static TStringCountPtr s_AddStringCount (
- char * string,
- int line_num,
- TStringCountPtr list
- )
- {
- TStringCountPtr add_to, last;
- TIntLinkPtr new_offset;
- if (string == NULL) {
- for (add_to = list;
- add_to != NULL && add_to->string != NULL;
- add_to = add_to->next) {
- last = add_to;
- }
- } else {
- for (add_to = list;
- add_to != NULL
- && (add_to->string == NULL
- || strcmp (string, add_to->string) != 0);
- add_to = add_to->next) {
- last = add_to;
- }
- }
-
- if (add_to == NULL) {
- add_to = s_StringCountNew (list);
- if (list == NULL) list = add_to;
- if (add_to != NULL) {
- add_to->string = string;
- }
- }
- if (add_to != NULL) {
- add_to->num_appearances ++;
- new_offset = s_IntLinkNew (line_num, add_to->line_numbers);
- if (add_to->line_numbers == NULL) {
- add_to->line_numbers = new_offset;
- }
- }
- return list;
- }
- /* The following functions are replacements for strncasecmp and strcasecmp */
/* Case-insensitive comparison of at most cmp_count characters of str1 and
 * str2 (a replacement for strncasecmp).
 * Returns 0 if the strings are equal over the compared range, a negative
 * value if str1 sorts before str2 and a positive value if it sorts after.
 * When the strings differ at a character, the result is the difference of
 * the two upper-cased characters taken as unsigned char values; when one
 * string is a proper prefix of the other, the result is -1 or 1.
 * A NULL string sorts before any non-NULL string; two NULLs are equal.
 */
static int s_StringNICmp (char * str1, char *str2, int cmp_count)
{
    int compared, diff;

    if (str1 == NULL && str2 == NULL) {
        return 0;
    }
    if (str1 == NULL) {
        return -1;
    }
    if (str2 == NULL) {
        return 1;
    }

    for (compared = 0; compared < cmp_count; compared++) {
        if (str1[compared] == 0 || str2[compared] == 0) {
            break;
        }
        /* <ctype.h> functions require an unsigned char value */
        diff = toupper ((unsigned char) str1[compared])
               - toupper ((unsigned char) str2[compared]);
        if (diff != 0) {
            return diff;
        }
    }

    if (compared == cmp_count) {
        return 0;
    }
    if (str1[compared] == 0 && str2[compared] != 0) {
        return -1;
    }
    if (str1[compared] != 0 && str2[compared] == 0) {
        return 1;
    }
    return 0;
}
/* Case-insensitive comparison of str1 and str2 (a replacement for
 * strcasecmp).
 * Returns 0 if the strings are equal, a negative value if str1 sorts before
 * str2 and a positive value if it sorts after.  When the strings differ at
 * a character, the result is the difference of the two upper-cased
 * characters taken as unsigned char values; when one string is a proper
 * prefix of the other, the result is -1 or 1.
 * A NULL string sorts before any non-NULL string; two NULLs are equal.
 */
static int s_StringICmp (char * str1, char *str2)
{
    const char * cp1;
    const char * cp2;
    int          diff;

    if (str1 == NULL && str2 == NULL) {
        return 0;
    }
    if (str1 == NULL) {
        return -1;
    }
    if (str2 == NULL) {
        return 1;
    }

    for (cp1 = str1, cp2 = str2; *cp1 != 0 && *cp2 != 0; cp1++, cp2++) {
        /* <ctype.h> functions require an unsigned char value */
        diff = toupper ((unsigned char) *cp1) - toupper ((unsigned char) *cp2);
        if (diff != 0) {
            return diff;
        }
    }

    if (*cp1 == 0 && *cp2 != 0) {
        return -1;
    }
    if (*cp1 != 0 && *cp2 == 0) {
        return 1;
    }
    return 0;
}
- /* The following functions are used to analyze specific kinds of lines
- * found in alignment files for information regarding the number of
- * expected sequences, the expected length of those sequences, and the
- * characters used to indicate missing, gap, and match characters.
- */
- /* This function reads two numbers separated by whitespace from the
- * beginning of the string and uses them to set the expected number of
- * sequences and the expected number of characters per sequence.
- */
- static void
- s_GetFASTAExpectedNumbers
- (char * str,
- SAlignRawFilePtr afrp)
- {
- char * cp;
- char * cpend;
- char replace;
- int first, second;
- if (str == NULL || afrp == NULL) {
- return;
- }
- cp = str;
- while (! isdigit ((int )*cp) && *cp != 0) {
- cp++;
- }
- cpend = cp;
- while (isdigit ((int )*cpend) && *cpend != 0) {
- cpend++;
- }
- if (cp == cpend) {
- return;
- }
- replace = *cpend;
- *cpend = 0;
- first = atol (cp);
- *cpend = replace;
- cp = cpend;
- while (! isdigit ((int )*cp) && *cp != 0) {
- cp++;
- }
- cpend = cp;
- while (isdigit ((int )*cpend) && *cpend != 0) {
- cpend++;
- }
- if (cp == cpend) {
- return;
- }
- replace = *cpend;
- *cpend = 0;
- second = atol (cp);
- *cpend = replace;
- if (first > 0 && second > 0) {
- afrp->expected_num_sequence = first;
- afrp->expected_sequence_len = second;
- }
-
- }
- /* This function examines the string str to see if it begins with two
- * numbers separated by whitespace. The function returns eTrue if so,
- * otherwise it returns eFalse.
- */
- static EBool s_IsTwoNumbersSeparatedBySpace (char * str)
- {
- char * cp;
- EBool found_first_number = eFalse;
- EBool found_dividing_space = eFalse;
- EBool found_second_number = eFalse;
- EBool found_second_number_end = eFalse;
- if (str == NULL) {
- return eFalse;
- }
- cp = str;
- while (*cp != 0) {
- if (! isdigit ((int )*cp) && ! isspace ((int )*cp)) {
- return eFalse;
- }
- if (! found_first_number) {
- if (! isdigit ((int )*cp)) {
- return eFalse;
- }
- found_first_number = eTrue;
- } else if (! found_dividing_space) {
- if ( isspace ((int ) *cp)) {
- found_dividing_space = eTrue;
- } else if ( ! isdigit ((int )*cp)) {
- return eFalse;
- }
- } else if (! found_second_number) {
- if ( isdigit ((int )*cp)) {
- found_second_number = eTrue;
- } else if (! isspace ((int ) *cp)) {
- return eFalse;
- }
- } else if (! found_second_number_end) {
- if ( isspace ((int ) *cp)) {
- found_second_number_end = eTrue;
- } else if (! isdigit ((int )*cp)) {
- return eFalse;
- }
- } else if (! isspace ((int ) *cp)) {
- return eFalse;
- }
- cp++;
- }
- if (found_second_number) {
- return eTrue;
- }
- return eFalse;
- }
- /* This function finds a value name in a string, looks for an equals sign
- * after the value name, and then looks for an integer value after the
- * equals sign. If the integer value is found, the function copies the
- * integer value into the val location and returns eTrue, otherwise the
- * function returns eFalse.
- */
- static EBool
- s_GetOneNexusSizeComment
- (char * str,
- char * valname,
- int * val)
- {
- char buf[MAX_PRINTED_INT_LEN_PLUS_ONE];
- char * cpstart;
- char * cpend;
- int maxlen;
- if (str == NULL || valname == NULL || val == NULL) {
- return eFalse;
- }
- cpstart = strstr (str, valname);
- if (cpstart == NULL) {
- return eFalse;
- }
- cpstart += strlen (valname);
- while (*cpstart != 0 && isspace ((int )*cpstart)) {
- cpstart++;
- }
- if (*cpstart != '=') {
- return eFalse;
- }
- cpstart ++;
- while (*cpstart != 0 && isspace ((int )*cpstart)) {
- cpstart++;
- }
- if (! isdigit ((int )*cpstart)) {
- return eFalse;
- }
- cpend = cpstart + 1;
- while ( *cpend != 0 && isdigit ((int )*cpend)) {
- cpend ++;
- }
- maxlen = cpend - cpstart;
- if (maxlen > kMaxPrintedIntLen) maxlen = kMaxPrintedIntLen;
- strncpy (buf, cpstart, maxlen);
- buf [maxlen] = 0;
- *val = atoi (buf);
- return eTrue;
- }
- /* This function looks for Nexus-style comments to indicate the number of
- * sequences and the number of characters per sequence expected from this
- * alignment file. If the function finds these comments, it returns eTrue,
- * otherwise it returns eFalse.
- */
- static void
- s_GetNexusSizeComments
- (char * str,
- EBool * found_ntax,
- EBool * found_nchar,
- SAlignRawFilePtr afrp)
- {
- int num_sequences;
- int num_chars;
-
- if (str == NULL || found_nchar == NULL
- || found_ntax == NULL || afrp == NULL) {
- return;
- }
- if (! *found_ntax &&
- (s_GetOneNexusSizeComment (str, "ntax", &num_sequences)
- || s_GetOneNexusSizeComment (str, "NTAX", &num_sequences))) {
- afrp->expected_num_sequence = num_sequences;
- *found_ntax = eTrue;
- }
- if (! *found_nchar &&
- (s_GetOneNexusSizeComment (str, "nchar", &num_chars)
- || s_GetOneNexusSizeComment (str, "NCHAR", &num_chars))) {
- afrp->expected_sequence_len = num_chars;
- *found_nchar = eTrue;
- }
- }
/* Looks in a Nexus-style comment for the character assigned to a specific
 * kind of character (match, missing, gap...).
 * str must contain a semicolon, and val_name must occur at or before the
 * first semicolon.  If val_name is followed by an equals sign (whitespace
 * before it is allowed), the function returns the first character after
 * the equals sign that is neither whitespace nor a single quote.
 * Returns 0 if str or val_name is NULL, if there is no semicolon, if
 * val_name is not found before it, or if no equals sign follows val_name.
 */
static char GetNexusTypechar (char * str, char * val_name)
{
    char * cp;
    char * cpend;

    if (str == NULL || val_name == NULL) {
        return 0;
    }
    cpend = strchr (str, ';');
    if (cpend == NULL) {
        return 0;
    }
    cp = strstr (str, val_name);
    if (cp == NULL || cp > cpend) {
        return 0;
    }
    cp += strlen (val_name);

    /* <ctype.h> functions require an unsigned char value */
    while ( isspace ((unsigned char) *cp)) {
        cp ++;
    }
    if (*cp != '=') {
        return 0;
    }
    cp++;
    /* the value may be written with quotes, e.g. gap='-' */
    while ( isspace ((unsigned char) *cp) || *cp == '\'') {
        cp ++;
    }
    return *cp;
}
- /* This function reads a Nexus-style comment line for the characters
- * specified for missing, match, and gap and compares the characters from
- * the comment with the characters specified in sequence_info. If any
- * discrepancies are found, the function reports the errors and returns eFalse,
- * otherwise the function returns eTrue.
- */
- static EBool s_CheckNexusCharInfo
- (char * str,
- TSequenceInfoPtr sequence_info,
- FReportErrorFunction errfunc,
- void * errdata)
- {
- char * cp;
- char c;
- if (str == NULL || sequence_info == NULL) {
- return eFalse;
- }
- cp = strstr (str, "format ");
- if (cp == NULL) {
- cp = strstr (str, "FORMAT ");
- }
- if (cp == NULL) {
- return eFalse;
- }
- if (errfunc == NULL) {
- return eTrue;
- }
- c = GetNexusTypechar (cp + 7, "missing");
- if (c == 0) {
- c = GetNexusTypechar (cp + 7, "MISSING");
- }
- if (c != 0 && sequence_info->missing != NULL
- && strchr (sequence_info->missing, c) == NULL)
- {
- s_ReportCharCommentError (sequence_info->missing, c, "MISSING",
- errfunc, errdata);
- }
-
- c = GetNexusTypechar (cp + 7, "gap");
- if (c == 0) {
- c = GetNexusTypechar (cp + 7, "GAP");
- }
- if (c != 0 && sequence_info->middle_gap != NULL
- && strchr (sequence_info->middle_gap, c) == NULL)
- {
- s_ReportCharCommentError (sequence_info->middle_gap, c, "GAP",
- errfunc, errdata);
- }
-
- c = GetNexusTypechar (cp + 7, "match");
- if (c == 0) {
- c = GetNexusTypechar (cp + 7, "MATCH");
- }
- if (c != 0 && sequence_info->match != NULL
- && strchr (sequence_info->match, c) == NULL)
- {
- s_ReportCharCommentError (sequence_info->match, c, "MATCH",
- errfunc, errdata);
- }
- return eTrue;
- }
- /* This function examines the string str to see if it consists entirely of
- * asterisks, colons, periods, and whitespace. If so, this line is assumed
- * to be a Clustal-style consensus line and the function returns eTrue.
- * otherwise the function returns false;
- */
- static EBool s_IsConsensusLine (char * str)
- {
- if (str == NULL
- || strspn (str, "*:. trn") < strlen (str)
- || strchr (str, '*') == NULL) {
- return eFalse;
- } else {
- return eTrue;
- }
- }
- /* This function identifies lines that begin with a NEXUS keyword and end
- * with a semicolon - they will not contain sequence data. The function
- * returns eTrue if the line contains only a NEXUS comment, eFalse otherwise.
- */
- static EBool s_SkippableNexusComment (char *str)
- {
- char * last_semicolon;
- if (str == NULL) {
- return eFalse;
- }
- last_semicolon = strrchr (str, ';');
- if (last_semicolon == NULL
- || strspn (last_semicolon + 1, " tr") != strlen (last_semicolon + 1)
- || strchr (str, ';') != last_semicolon) {
- return eFalse;
- }
- if (s_StringNICmp (str, "format ", 7) == 0
- || s_StringNICmp (str, "dimensions ", 11) == 0
- || s_StringNICmp (str, "dimensions ", 11) == 0
- || s_StringNICmp (str, "options ", 8) == 0
- || s_StringNICmp (str, "begin characters", 16) == 0
- || s_StringNICmp (str, "begin data", 10) == 0) {
- return eTrue;
- } else {
- return eFalse;
- }
- }
- /* This function determines whether the contents of str are "skippable"
- * in that they do not contain sequence data and therefore should not be
- * considered part of any block patterns or sequence data.
- */
- static EBool s_SkippableString (char * str)
- {
- if (str == NULL
- || s_StringNICmp (str, "matrix", 6) == 0
- || s_StringNICmp (str, "#NEXUS", 6) == 0
- || s_StringNICmp (str, "CLUSTAL W", 8) == 0
- || s_SkippableNexusComment (str)
- || s_IsTwoNumbersSeparatedBySpace (str)
- || s_IsConsensusLine (str)
- || str [0] == ';') {
- return eTrue;
- } else {
- return eFalse;
- }
- }
- /* This function determines whether or not str contains a blank line.
- */
- static EBool s_IsBlank (char * str)
- {
- size_t len;
- if (str == NULL) {
- return eTrue;
- }
- len = strspn (str, " tr");
- if (len == strlen (str)) {
- return eTrue;
- }
- return eFalse;
- }
- /* This function determines whether or not linestring contains a line
- * indicating the end of sequence data (organism information and definition
- * lines may occur after this line).
- */
- static EBool s_FoundStopLine (char * linestring)
- {
- if (linestring == NULL) {
- return eFalse;
- }
- if (s_StringNICmp (linestring, "endblock", 8) == 0
- || s_StringNICmp (linestring, "end;", 4) == 0) {
- return eTrue;
- }
- return eFalse;
- }
- /* This function identifies the beginning line of an ASN.1 file, which
- * cannot be read by the alignment reader.
- */
- static EBool s_IsASN1 (char * linestring)
- {
- if (linestring != NULL && strstr (linestring, "::=") != NULL) {
- return eTrue;
- } else {
- return eFalse;
- }
- }
- /* The following functions are used to locate and read comments enclosed
- * in brackets. These comments sometimes include organism information.
- */
- /* This function frees memory associated with a SCommentLoc structure. */
- static void s_CommentLocFree (TCommentLocPtr clp)
- {
- if (clp == NULL) {
- return;
- }
- s_CommentLocFree (clp->next);
- free (clp);
- }
- /* This function finds the first comment enclosed in brackets and creates
- * a SCommentLoc structure to indicate the position of the comment
- * in the string. The function returns a pointer to this structure if a
- * comment is found or a NULL if the string does not contain a bracketed
- * comment.
- */
- static TCommentLocPtr s_FindComment (char * string)
- {
- char * cp_start;
- char * cp_end;
- TCommentLocPtr clp;
- if (string == NULL) {
- return NULL;
- }
- cp_start = strstr (string, "[");
- if (cp_start != NULL) {
- cp_end = strstr (cp_start, "]");
- if (cp_end != NULL) {
- clp = (TCommentLocPtr) malloc (sizeof (SCommentLoc));
- if (clp == NULL) {
- return NULL;
- }
- clp->start = cp_start;
- clp->end = cp_end;
- clp->next = NULL;
- return clp;
- }
- }
- return NULL;
- }
/* Removes every bracketed comment ("[...]") from linestring in place.
 *
 * A comment is the first '[' together with the first ']' that follows it;
 * removal repeats until no complete pair is left.  An unterminated '['
 * is left alone.  Afterwards:
 *   - a line consisting solely of ">" is emptied, so that the arrow left
 *     behind by an organism comment does not end up in the sequence data;
 *   - a line consisting solely of spaces, tabs and carriage returns is
 *     emptied.
 * A NULL linestring is ignored.
 */
static void s_RemoveCommentFromLine (char * linestring)
{
    char * open_bracket;
    char * close_bracket;
    char * tail;

    if (linestring == NULL) {
        return;
    }

    open_bracket = strchr (linestring, '[');
    while (open_bracket != NULL
           && (close_bracket = strchr (open_bracket, ']')) != NULL) {
        tail = close_bracket + 1;
        /* source and destination overlap, so memmove (not strcpy) is
         * required; the + 1 carries the terminating NUL along
         */
        memmove (open_bracket, tail, strlen (tail) + 1);
        /* nothing before open_bracket contains '[', so resume from here */
        open_bracket = strchr (open_bracket, '[');
    }

    /* if we have read an organism comment and that's all there was on the
     * line, get rid of the arrow character as well so it doesn't end up
     * in the sequence data
     */
    if (linestring [0] == '>' && linestring [1] == 0) {
        linestring [0] = 0;
    }

    /* if the line now contains only whitespace, truncate it */
    if (strspn (linestring, " \t\r") == strlen (linestring)) {
        linestring [0] = 0;
    }
}
- /* This function determines whether or not a comment describes an organism
- * by looking for org= or organism= inside the brackets.
- */
- static EBool s_IsOrganismComment (TCommentLocPtr clp)
- {
- int len;
- char * cp;
- char * cp_end;
- if (clp == NULL || clp->start == NULL || clp->end == NULL) {
- return eFalse;
- }
-
- cp = clp->start;
- if (*cp != '[') {
- return eFalse;
- }
- cp ++;
- len = strspn ( clp->start, " tr");
- cp = cp + len;
- cp_end = strstr (cp, "=");
- if (cp_end == NULL) {
- return eFalse;
- }
- cp_end --;
- while (cp_end > cp && isspace ((int )*cp_end)) {
- cp_end --;
- }
- cp_end ++;
- if ((cp_end - cp == 3 && s_StringNICmp (cp, "org", 3) == 0)
- || (cp_end - cp == 8 && s_StringNICmp (cp, "organism", 8) == 0)) {
- return eTrue;
- }
- return eFalse;
- }
- /* This function finds an organism comment, which includes the first bracketed
- * comment with org= or organism=, plus any additional bracketed comments
- * separated only by whitespace from the org= or organism= comment.
- * The function returns a pointer to a SCommentLoc structure describing
- * the location of the organism comment.
- */
- static TCommentLocPtr s_FindOrganismComment (char * string)
- {
- TCommentLocPtr clp, next_clp;
- if (string == NULL) {
- return NULL;
- }
- clp = s_FindComment (string);
- while (clp != NULL && ! s_IsOrganismComment (clp)) {
- clp = s_FindComment (clp->end);
- }
- if (clp == NULL) {
- return NULL;
- }
- next_clp = s_FindComment (clp->end);
- while (next_clp != NULL &&
- (int) strspn (clp->end + 1, " tr") == next_clp->start - clp->end - 1
- && ! s_IsOrganismComment (next_clp))
- {
- clp->end = next_clp->end;
- next_clp = s_FindComment (clp->end);
- }
- return clp;
- }
- /* This function removes an organism comment from a line. */
- static void s_RemoveOrganismCommentFromLine (char * string)
- {
- TCommentLocPtr clp;
- while ((clp = s_FindOrganismComment (string)) != NULL) {
- strcpy (clp->start, clp->end + 1);
- s_CommentLocFree (clp);
- }
- }
-
- /* This function creates an ordered list of comments within an organism
- * comment and returns a pointer to the first item in the linked list.
- * In an ordered org name, the org= value appears first, followed by other
- * bracketed values in alphabetical order.
- */
- static TCommentLocPtr s_CreateOrderedOrgCommentList (TCommentLocPtr org_clp)
- {
- TCommentLocPtr clp, prev_clp, next_clp, clp_list, ordered_start;
- int next_len, this_len, len;
-
- if (org_clp == NULL) {
- return NULL;
- }
- clp_list = s_FindComment (org_clp->start); /* this is the org= */
- prev_clp = NULL;
- ordered_start = s_FindComment (clp_list->end);
- if (ordered_start == NULL) {
- return clp_list;
- }
- clp = s_FindComment (ordered_start->end);
- while (clp != NULL && clp->start < org_clp->end) {
- /* insert new comment into list */
- prev_clp = NULL;
- next_clp = ordered_start;
- next_len = next_clp->end - next_clp->start;
- this_len = clp->end - clp->start;
- len = next_len > this_len ? next_len : this_len;
- while (next_clp != NULL
- && strncmp (next_clp->start, clp->start, len) < 0)
- {
- prev_clp = next_clp;
- next_clp = next_clp->next;
- if (next_clp != NULL) {
- next_len = next_clp->end - next_clp->start;
- len = next_len > this_len ? next_len : this_len;
- }
- }
- if (prev_clp == NULL) {
- clp->next = ordered_start;
- ordered_start = clp;
- } else {
- clp->next = prev_clp->next;
- prev_clp->next = clp;
- }
- clp = s_FindComment (clp->end);
- }
- clp_list->next = ordered_start;
- return clp_list;
- }
- /* This function creates an ordered organism name based on the bracketed
- * comments contained in the location described by org_clp.
- */
- static char * s_CreateOrderedOrgName (TCommentLocPtr org_clp)
- {
- TCommentLocPtr clp, clp_list;
- char * ordered_org_name;
- char * cp;
- if (org_clp == NULL) {
- return NULL;
- }
- ordered_org_name = malloc (org_clp->end - org_clp->start + 2);
- if (ordered_org_name == NULL) {
- return NULL;
- }
- ordered_org_name [0] = 0;
- clp_list = s_CreateOrderedOrgCommentList (org_clp);
- cp = ordered_org_name;
- for (clp = clp_list; clp != NULL; clp = clp->next) {
- strncpy (cp, clp->start, clp->end - clp->start + 1);
- cp += clp->end - clp->start + 1;
- *cp = 0;
- }
-
- s_CommentLocFree (clp_list);
- return ordered_org_name;
- }
- /* This function is used to read any organism names that may appear in
- * string, including any modifiers that may appear after the organism name.
- */
- static void s_ReadOrgNamesFromText
- (char * string,
- int line_num,
- SAlignRawFilePtr afrp)
- {
- TCommentLocPtr clp;
- char * org_name;
- char * cp;
- char * defline;
- char * comment_end;
- int defline_offset;
-
- if (string == NULL || afrp == NULL) {
- return;
- }
- clp = s_FindOrganismComment (string);
- if (clp == NULL && (strstr (string, "org=") != NULL || strstr (string, "organism=") != NULL))
- {
- s_ReportOrgCommentError (string, afrp->report_error, afrp->report_error_userdata);
- }
- while (clp != NULL) {
- org_name = s_CreateOrderedOrgName (clp);
- afrp->organisms = s_AddLineInfo (afrp->organisms, org_name, line_num,
- clp->start - string);
- free (org_name);
- afrp->num_organisms ++;
- defline = NULL;
- defline_offset = 0;
- if (*clp->end != 0) {
- cp = clp->end + 1;
- cp += strspn (cp, " trn");
- if (*cp != 0) {
- defline = clp->end + 1;
- defline_offset = clp->end - string + 1;
- }
- }
- afrp->deflines = s_AddLineInfo (afrp->deflines, defline, line_num,
- defline_offset);
- afrp->num_deflines ++;
-
- comment_end = clp->end;
- s_CommentLocFree (clp);
- clp = s_FindOrganismComment (comment_end);
- }
- }
- /* The following group of functions manages the SAlignRawSeq structure,
- * which is used to track the IDs of sequences in the file, the sequence
- * characters for those IDs, and the locations of the IDs and sequence
- * characters.
- */
- /* This function allocates memory for an SAlignRawSeq structure,
- * initializes its member variables, and returns a pointer to the newly
- * allocated structure.
- */
- static TAlignRawSeqPtr s_AlignRawSeqNew (TAlignRawSeqPtr list)
- {
- TAlignRawSeqPtr arsp, last;
- arsp = (TAlignRawSeqPtr)malloc (sizeof (SAlignRawSeq));
- if (arsp == NULL) {
- return NULL;
- }
- arsp->id = NULL;
- arsp->sequence_data = NULL;
- arsp->id_lines = NULL;
- arsp->next = NULL;
- last = list;
- while (last != NULL && last->next != NULL) {
- last = last->next;
- }
- if (last != NULL) {
- last->next = arsp;
- }
- return arsp;
- }
- /* This function frees the memory associated with an SAlignRawSeq
- * structure's member variables and with the structure itself.
- */
- static void s_AlignRawSeqFree (TAlignRawSeqPtr arsp)
- {
- if (arsp == NULL) {
- return;
- }
- s_AlignRawSeqFree (arsp->next);
- free (arsp->id);
- s_LineInfoFree (arsp->sequence_data);
- s_IntLinkFree (arsp->id_lines);
- }
- /* This function returns a pointer to the sequence in list with the specified
- * ID, unless there is no such sequence, in which case the function returns
- * NULL.
- */
- static TAlignRawSeqPtr
- s_FindAlignRawSeqById
- (TAlignRawSeqPtr list,
- char * id)
- {
- TAlignRawSeqPtr arsp;
- for (arsp = list; arsp != NULL; arsp = arsp->next) {
- if (strcmp (arsp->id, id) == 0) {
- return arsp;
- }
- }
- return NULL;
- }
- /* This function finds the position of a given ID in the sequence list,
- * unless the ID is not found in the list, in which case the function returns
- * -1.
- */
- static int
- s_FindAlignRawSeqOffsetById
- (TAlignRawSeqPtr list,
- char * id)
- {
- TAlignRawSeqPtr arsp;
- int offset;
- for (arsp = list, offset = 0; arsp != NULL; arsp = arsp->next, offset++) {
- if (strcmp (arsp->id, id) == 0) {