sequence.cpp
上传用户:yhdzpy8989
上传日期:2007-06-13
资源大小:13604k
文件大小:137k
- case eOverlap_CheckIntervals:
- case eOverlap_Interval:
- revert_locations = true;
- // there's no break here - proceed to "default"
- default:
- // Require intervals overlap
- annot_overlap_type = SAnnotSelector::eOverlap_Intervals;
- break;
- }
- CConstRef<CSeq_feat> feat_ref;
- int diff = -1;
- CFeat_CI feat_it(scope, loc, SAnnotSelector()
- .SetFeatType(feat_type)
- .SetFeatSubtype(feat_subtype)
- .SetOverlapType(annot_overlap_type)
- .SetResolveTSE());
- for ( ; feat_it; ++feat_it) {
- // treat subset as a special case
- int cur_diff = ( !revert_locations ) ?
- TestForOverlap(loc, feat_it->GetLocation(), overlap_type) :
- TestForOverlap(feat_it->GetLocation(), loc, overlap_type);
- if (cur_diff < 0)
- continue;
- if ( cur_diff < diff || diff < 0 ) {
- diff = cur_diff;
- feat_ref = &feat_it->GetMappedFeature();
- }
- }
- return feat_ref;
- }
- CConstRef<CSeq_feat> GetBestOverlappingFeat(const CSeq_loc& loc,
- CSeqFeatData::E_Choice feat_type,
- EOverlapType overlap_type,
- CScope& scope)
- {
- return x_GetBestOverlappingFeat(loc,
- feat_type, CSeqFeatData::eSubtype_any,
- overlap_type, scope);
- }
- CConstRef<CSeq_feat> GetBestOverlappingFeat(const CSeq_loc& loc,
- CSeqFeatData::ESubtype feat_type,
- EOverlapType overlap_type,
- CScope& scope)
- {
- return x_GetBestOverlappingFeat(loc,
- CSeqFeatData::GetTypeFromSubtype(feat_type), feat_type,
- overlap_type, scope);
- }
- CConstRef<CSeq_feat> GetOverlappingGene(const CSeq_loc& loc, CScope& scope)
- {
- return GetBestOverlappingFeat(loc, CSeqFeatData::eSubtype_gene,
- eOverlap_Contains, scope);
- }
- CConstRef<CSeq_feat> GetOverlappingmRNA(const CSeq_loc& loc, CScope& scope)
- {
- return GetBestOverlappingFeat(loc, CSeqFeatData::eSubtype_mRNA,
- eOverlap_Contains, scope);
- }
- CConstRef<CSeq_feat> GetOverlappingCDS(const CSeq_loc& loc, CScope& scope)
- {
- return GetBestOverlappingFeat(loc, CSeqFeatData::eSubtype_cdregion,
- eOverlap_Contains, scope);
- }
- CConstRef<CSeq_feat> GetOverlappingPub(const CSeq_loc& loc, CScope& scope)
- {
- return GetBestOverlappingFeat(loc, CSeqFeatData::eSubtype_pub,
- eOverlap_Contains, scope);
- }
- CConstRef<CSeq_feat> GetOverlappingSource(const CSeq_loc& loc, CScope& scope)
- {
- return GetBestOverlappingFeat(loc, CSeqFeatData::eSubtype_biosrc,
- eOverlap_Contains, scope);
- }
- CConstRef<CSeq_feat> GetOverlappingOperon(const CSeq_loc& loc, CScope& scope)
- {
- return GetBestOverlappingFeat(loc, CSeqFeatData::eSubtype_operon,
- eOverlap_Contains, scope);
- }
- int SeqLocPartialCheck(const CSeq_loc& loc, CScope* scope)
- {
- unsigned int retval = 0;
- if (!scope) {
- return retval;
- }
- // Find first and last Seq-loc
- const CSeq_loc *first = 0, *last = 0;
- for ( CSeq_loc_CI loc_iter(loc); loc_iter; ++loc_iter ) {
- if ( first == 0 ) {
- first = &(loc_iter.GetSeq_loc());
- }
- last = &(loc_iter.GetSeq_loc());
- }
- if (!first) {
- return retval;
- }
- CSeq_loc_CI i2(loc, CSeq_loc_CI::eEmpty_Allow);
- for ( ; i2; ++i2 ) {
- const CSeq_loc* slp = &(i2.GetSeq_loc());
- switch (slp->Which()) {
- case CSeq_loc::e_Null:
- if (slp == first) {
- retval |= eSeqlocPartial_Start;
- } else if (slp == last) {
- retval |= eSeqlocPartial_Stop;
- } else {
- retval |= eSeqlocPartial_Internal;
- }
- break;
- case CSeq_loc::e_Int:
- {
- const CSeq_interval& itv = slp->GetInt();
- if (itv.IsSetFuzz_from()) {
- const CInt_fuzz& fuzz = itv.GetFuzz_from();
- if (fuzz.Which() == CInt_fuzz::e_Lim) {
- CInt_fuzz::ELim lim = fuzz.GetLim();
- if (lim == CInt_fuzz::eLim_gt) {
- retval |= eSeqlocPartial_Limwrong;
- } else if (lim == CInt_fuzz::eLim_lt ||
- lim == CInt_fuzz::eLim_unk) {
- if (itv.IsSetStrand() &&
- itv.GetStrand() == eNa_strand_minus) {
- if (slp == last) {
- retval |= eSeqlocPartial_Stop;
- } else {
- retval |= eSeqlocPartial_Internal;
- }
- if (itv.GetFrom() != 0) {
- if (slp == last) {
- retval |= eSeqlocPartial_Nostop;
- } else {
- retval |= eSeqlocPartial_Nointernal;
- }
- }
- } else {
- if (slp == first) {
- retval |= eSeqlocPartial_Start;
- } else {
- retval |= eSeqlocPartial_Internal;
- }
- if (itv.GetFrom() != 0) {
- if (slp == first) {
- retval |= eSeqlocPartial_Nostart;
- } else {
- retval |= eSeqlocPartial_Nointernal;
- }
- }
- }
- }
- }
- }
-
- if (itv.IsSetFuzz_to()) {
- const CInt_fuzz& fuzz = itv.GetFuzz_to();
- CInt_fuzz::ELim lim = fuzz.IsLim() ?
- fuzz.GetLim() : CInt_fuzz::eLim_unk;
- if (lim == CInt_fuzz::eLim_lt) {
- retval |= eSeqlocPartial_Limwrong;
- } else if (lim == CInt_fuzz::eLim_gt ||
- lim == CInt_fuzz::eLim_unk) {
- CBioseq_Handle hnd =
- scope->GetBioseqHandle(itv.GetId());
- bool miss_end = false;
- if ( hnd ) {
- CBioseq_Handle::TBioseqCore bc = hnd.GetBioseqCore();
-
- if (itv.GetTo() != bc->GetInst().GetLength() - 1) {
- miss_end = true;
- }
- }
- if (itv.IsSetStrand() &&
- itv.GetStrand() == eNa_strand_minus) {
- if (slp == first) {
- retval |= eSeqlocPartial_Start;
- } else {
- retval |= eSeqlocPartial_Internal;
- }
- if (miss_end) {
- if (slp == first /* was last */) {
- retval |= eSeqlocPartial_Nostart;
- } else {
- retval |= eSeqlocPartial_Nointernal;
- }
- }
- } else {
- if (slp == last) {
- retval |= eSeqlocPartial_Stop;
- } else {
- retval |= eSeqlocPartial_Internal;
- }
- if (miss_end) {
- if (slp == last) {
- retval |= eSeqlocPartial_Nostop;
- } else {
- retval |= eSeqlocPartial_Nointernal;
- }
- }
- }
- }
- }
- }
- break;
- case CSeq_loc::e_Pnt:
- if (slp->GetPnt().IsSetFuzz()) {
- const CInt_fuzz& fuzz = slp->GetPnt().GetFuzz();
- if (fuzz.Which() == CInt_fuzz::e_Lim) {
- CInt_fuzz::ELim lim = fuzz.GetLim();
- if (lim == CInt_fuzz::eLim_gt ||
- lim == CInt_fuzz::eLim_lt ||
- lim == CInt_fuzz::eLim_unk) {
- if (slp == first) {
- retval |= eSeqlocPartial_Start;
- } else if (slp == last) {
- retval |= eSeqlocPartial_Stop;
- } else {
- retval |= eSeqlocPartial_Internal;
- }
- }
- }
- }
- break;
- case CSeq_loc::e_Packed_pnt:
- if (slp->GetPacked_pnt().IsSetFuzz()) {
- const CInt_fuzz& fuzz = slp->GetPacked_pnt().GetFuzz();
- if (fuzz.Which() == CInt_fuzz::e_Lim) {
- CInt_fuzz::ELim lim = fuzz.GetLim();
- if (lim == CInt_fuzz::eLim_gt ||
- lim == CInt_fuzz::eLim_lt ||
- lim == CInt_fuzz::eLim_unk) {
- if (slp == first) {
- retval |= eSeqlocPartial_Start;
- } else if (slp == last) {
- retval |= eSeqlocPartial_Stop;
- } else {
- retval |= eSeqlocPartial_Internal;
- }
- }
- }
- }
- break;
- case CSeq_loc::e_Whole:
- {
- // Get the Bioseq referred to by Whole
- CBioseq_Handle bsh = scope->GetBioseqHandle(slp->GetWhole());
- if ( !bsh ) {
- break;
- }
- // Check for CMolInfo on the biodseq
- CSeqdesc_CI di( bsh, CSeqdesc::e_Molinfo );
- if ( !di ) {
- // If no CSeq_descr, nothing can be done
- break;
- }
- // First try to loop through CMolInfo
- const CMolInfo& mi = di->GetMolinfo();
- if (!mi.IsSetCompleteness()) {
- continue;
- }
- switch (mi.GetCompleteness()) {
- case CMolInfo::eCompleteness_no_left:
- if (slp == first) {
- retval |= eSeqlocPartial_Start;
- } else {
- retval |= eSeqlocPartial_Internal;
- }
- break;
- case CMolInfo::eCompleteness_no_right:
- if (slp == last) {
- retval |= eSeqlocPartial_Stop;
- } else {
- retval |= eSeqlocPartial_Internal;
- }
- break;
- case CMolInfo::eCompleteness_partial:
- retval |= eSeqlocPartial_Other;
- break;
- case CMolInfo::eCompleteness_no_ends:
- retval |= eSeqlocPartial_Start;
- retval |= eSeqlocPartial_Stop;
- break;
- default:
- break;
- }
- break;
- }
- default:
- break;
- }
- }
- return retval;
- }
- typedef CSeq_loc::TRange TRange;
- typedef vector<TRange> TRangeVec;
- CSeq_loc* SeqLocMerge
- (const CBioseq_Handle& target,
- const CSeq_loc& loc1,
- const CSeq_loc& loc2,
- TSeqLocFlags flags)
- {
- _ASSERT(target);
- vector< CConstRef<CSeq_loc> > locs;
- locs.push_back(CConstRef<CSeq_loc>(&loc1));
- locs.push_back(CConstRef<CSeq_loc>(&loc2));
- return SeqLocMerge(target, locs, flags);
- }
- static void s_CollectRanges(const CSeq_loc& loc, TRangeVec& ranges)
- {
- // skipping empty / nulls
- for ( CSeq_loc_CI li(loc); li; ++li ) {
- ranges.push_back(li.GetRange());
- }
- }
- static void s_MergeRanges(TRangeVec& ranges, TSeqLocFlags flags)
- {
- bool merge = (flags & fMergeIntervals) != 0;
- bool fuse = (flags & fFuseAbutting) != 0;
- if ( !merge && !fuse ) {
- return;
- }
- TRangeVec::iterator next = ranges.begin();
- TRangeVec::iterator curr = next++;
- bool advance = true;
- while ( next != ranges.end() ) {
- advance = true;
- if ( (merge && curr->GetTo() >= next->GetFrom()) ||
- (fuse && curr->GetTo() + 1 == next->GetFrom()) ) {
- *curr += *next;
- next = ranges.erase(next);
- advance = false;
- }
- if ( advance ) {
- curr = next++;
- }
- }
- }
- static CSeq_loc* s_CreateSeqLoc
- (CSeq_id& id,
- const TRangeVec& ranges,
- ENa_strand strand,
- bool rearranged,
- bool add_null)
- {
- static const CSeq_loc null_loc(CSeq_loc::e_Null);
- bool has_pnt = false;
- bool has_int = false;
- if ( ranges.size() < 2 ) {
- add_null = false;
- }
- if ( !add_null ) {
- ITERATE(TRangeVec, it, ranges) {
- if ( it->GetLength() == 1 ) {
- has_pnt = true;
- } else {
- has_int = true;
- }
- }
- }
-
- CSeq_loc::E_Choice loc_type = CSeq_loc::e_not_set;
- if ( add_null ) {
- loc_type = CSeq_loc::e_Mix;
- } else if ( has_pnt && has_int ) { // points and intervals => mix location
- loc_type = CSeq_loc::e_Mix;
- } else if ( has_pnt ) { // only points
- loc_type = ranges.size() == 1 ? CSeq_loc::e_Pnt : CSeq_loc::e_Packed_pnt;
- } else if ( has_int ) { // only intervals
- loc_type = ranges.size() == 1 ? CSeq_loc::e_Int : CSeq_loc::e_Packed_int;
- } else {
- loc_type = CSeq_loc::e_Null;
- }
- CRef<CSeq_loc> result;
- switch ( loc_type ) {
- case CSeq_loc::e_Pnt:
- {
- result.Reset(new CSeq_loc(id, ranges.front().GetFrom(), strand));
- break;
- }
- case CSeq_loc::e_Packed_pnt:
- {
- CPacked_seqpnt::TPoints points(ranges.size());;
- ITERATE(TRangeVec, it, ranges) {
- points.push_back(it->GetFrom());
- }
- result.Reset(new CSeq_loc(id, points, strand));
-
- break;
- }
- case CSeq_loc::e_Int:
- {
- const TRange& r = ranges.front();
- result.Reset(new CSeq_loc(id, r.GetFrom(), r.GetTo(), strand));
- break;
- }
- case CSeq_loc::e_Packed_int:
- {
- result.Reset(new CSeq_loc(id, ranges, strand));
- if ( rearranged ) {
- // the first 2 intervals span the origin
- CPacked_seqint::Tdata ivals = result->SetPacked_int().Set();
- CPacked_seqint::Tdata::iterator it = ivals.begin();
- if ( strand == eNa_strand_minus ) {
- (*it)->SetFuzz_from().SetLim(CInt_fuzz::eLim_circle);
- ++it;
- (*it)->SetFuzz_to().SetLim(CInt_fuzz::eLim_circle);
- } else {
- (*it)->SetFuzz_to().SetLim(CInt_fuzz::eLim_circle);
- ++it;
- (*it)->SetFuzz_from().SetLim(CInt_fuzz::eLim_circle);
- }
- }
- break;
- }
- case CSeq_loc::e_Mix:
- {
- result.Reset(new CSeq_loc);
- CSeq_loc_mix& mix = result->SetMix();
- TRangeVec::const_iterator last = ranges.end(); --last;
- ITERATE(TRangeVec, it, ranges) {
- if ( it->GetLength() == 1 ) {
- mix.AddSeqLoc(*new CSeq_loc(id, it->GetFrom(), strand));
- } else {
- mix.AddSeqLoc(*new CSeq_loc(id, it->GetFrom(), it->GetTo(), strand));
- }
- if ( add_null && it != last ) {
- mix.AddSeqLoc(null_loc);
- }
- }
- break;
- }
- case CSeq_loc::e_Null:
- {
- result.Reset(new CSeq_loc);
- result->SetNull();
- break;
- }
- default:
- {
- result.Reset();
- }
- };
- return result.Release();
- }
- static void s_RearrangeRanges(TRangeVec& ranges)
- {
- deque<TRange> temp(ranges.size());
- copy(ranges.begin(), ranges.end(), temp.begin());
- ranges.clear();
- temp.push_front(temp.back());
- temp.pop_back();
- copy(temp.begin(), temp.end(), back_inserter(ranges));
- }
- CSeq_loc* SeqLocMergeOne
- (const CBioseq_Handle& target,
- const CSeq_loc& loc,
- TSeqLocFlags flags)
- {
- _ASSERT(target);
- TRangeVec ranges;
- CRef<CSeq_id> id(new CSeq_id);
- id->Assign(GetId(target, eGetId_Best));
- const CSeq_inst& inst = target.GetInst();
- CSeq_inst::TTopology topology = inst.CanGetTopology() ?
- inst.GetTopology() : CSeq_inst::eTopology_not_set;
- bool circular = (topology == CSeq_inst::eTopology_circular);
- if ( circular ) { // circular topology overrides fSingleInterval flag
- flags &= ~fSingleInterval;
- }
- TSeqPos seq_len = inst.IsSetLength() ? inst.GetLength() : 0;
- // map the location to the target bioseq
- CRef<CSeq_loc> mapped_loc(target.MapLocation(loc));
- _ASSERT(IsOneBioseq(*mapped_loc)); // doesn't have multiple bioseqs
- ENa_strand strand = GetStrand(*mapped_loc);
- bool rearranged = false;
- if ( (flags & fSingleInterval) != 0 ) {
- ranges.push_back(mapped_loc->GetTotalRange());
- } else {
- if ( strand == eNa_strand_other ) { // multiple strands in location
- return mapped_loc.Release();
- }
-
- s_CollectRanges(*mapped_loc, ranges);
- sort(ranges.begin(), ranges.end()); // on the plus strand
- s_MergeRanges(ranges, flags);
- if ( strand == eNa_strand_minus ) {
- reverse(ranges.begin(), ranges.end());
- }
- // if circular bioseq and ranges span the origin, rearrange
- if ( circular ) {
- if ( ((strand == eNa_strand_minus) &&
- (ranges.front().GetTo() == seq_len - 1) &&
- (ranges.back().GetFrom() == 0)) ||
- ((strand != eNa_strand_minus) &&
- (ranges.front().GetFrom() == 0) &&
- (ranges.back().GetTo() == seq_len - 1)) ) {
- rearranged = true;
- s_RearrangeRanges(ranges);
- }
- }
- }
- return s_CreateSeqLoc(*id, ranges, strand, rearranged, (flags & fAddNulls) != 0);
- }
- static CSeq_interval* s_SeqIntRevCmp(const CSeq_interval& i, CScope* scope)
- {
- CRef<CSeq_interval> rev_int(new CSeq_interval);
- rev_int->Assign(i);
-
- ENa_strand s = i.CanGetStrand() ? i.GetStrand() : eNa_strand_unknown;
- rev_int->SetStrand(Reverse(s));
- return rev_int.Release();
- }
- static CSeq_point* s_SeqPntRevCmp(const CSeq_point& pnt, CScope* scope)
- {
- CRef<CSeq_point> rev_pnt(new CSeq_point);
- rev_pnt->Assign(pnt);
-
- ENa_strand s = pnt.CanGetStrand() ? pnt.GetStrand() : eNa_strand_unknown;
- rev_pnt->SetStrand(Reverse(s));
- return rev_pnt.Release();
- }
- CSeq_loc* SeqLocRevCmp(const CSeq_loc& loc, CScope* scope)
- {
- CRef<CSeq_loc> rev_loc(new CSeq_loc);
- switch ( loc.Which() ) {
- // -- reverse is the same.
- case CSeq_loc::e_Null:
- case CSeq_loc::e_Empty:
- case CSeq_loc::e_Whole:
- rev_loc->Assign(loc);
- break;
- // -- just reverse the strand
- case CSeq_loc::e_Int:
- rev_loc->SetInt(*s_SeqIntRevCmp(loc.GetInt(), scope));
- break;
- case CSeq_loc::e_Pnt:
- rev_loc->SetPnt(*s_SeqPntRevCmp(loc.GetPnt(), scope));
- break;
- case CSeq_loc::e_Packed_pnt:
- rev_loc->SetPacked_pnt().Assign(loc.GetPacked_pnt());
- rev_loc->SetPacked_pnt().SetStrand(Reverse(GetStrand(loc, scope)));
- break;
- // -- possibly more than one sequence
- case CSeq_loc::e_Packed_int:
- {
- // reverse each interval and store them in reverse order
- typedef CRef< CSeq_interval > TInt;
- CPacked_seqint& pint = rev_loc->SetPacked_int();
- ITERATE (CPacked_seqint::Tdata, it, loc.GetPacked_int().Get()) {
- pint.Set().push_front(TInt(s_SeqIntRevCmp(**it, scope)));
- }
- break;
- }
- case CSeq_loc::e_Mix:
- {
- // reverse each location and store them in reverse order
- typedef CRef< CSeq_loc > TLoc;
- CSeq_loc_mix& mix = rev_loc->SetMix();
- ITERATE (CSeq_loc_mix::Tdata, it, loc.GetMix().Get()) {
- mix.Set().push_front(TLoc(SeqLocRevCmp(**it, scope)));
- }
- break;
- }
- case CSeq_loc::e_Equiv:
- {
- // reverse each location (no need to reverse order)
- typedef CRef< CSeq_loc > TLoc;
- CSeq_loc_equiv& e = rev_loc->SetEquiv();
- ITERATE (CSeq_loc_equiv::Tdata, it, loc.GetEquiv().Get()) {
- e.Set().push_back(TLoc(SeqLocRevCmp(**it, scope)));
- }
- break;
- }
- case CSeq_loc::e_Bond:
- {
- CSeq_bond& bond = rev_loc->SetBond();
- bond.SetA(*s_SeqPntRevCmp(loc.GetBond().GetA(), scope));
- if ( loc.GetBond().CanGetB() ) {
- bond.SetA(*s_SeqPntRevCmp(loc.GetBond().GetB(), scope));
- }
- }
-
- // -- not supported
- case CSeq_loc::e_Feat:
- default:
- NCBI_THROW(CException, eUnknown,
- "CSeq_loc::SeqLocRevCmp -- unsupported location type");
- }
- return rev_loc.Release();
- }
- // Get the encoding CDS feature of a given protein sequence.
- const CSeq_feat* GetCDSForProduct(const CBioseq& product, CScope* scope)
- {
- if ( scope == 0 ) {
- return 0;
- }
- return GetCDSForProduct(scope->GetBioseqHandle(product));
- }
- const CSeq_feat* GetCDSForProduct(const CBioseq_Handle& bsh)
- {
- if ( bsh ) {
- CFeat_CI fi(bsh,
- 0, 0,
- CSeqFeatData::e_Cdregion,
- SAnnotSelector::eOverlap_Intervals,
- SAnnotSelector::eResolve_TSE,
- CFeat_CI::e_Product,
- 0);
- if ( fi ) {
- // return the first one (should be the one packaged on the
- // nuc-prot set).
- return &(fi->GetOriginalFeature());
- }
- }
- return 0;
- }
- // Get the mature peptide feature of a protein
- const CSeq_feat* GetPROTForProduct(const CBioseq& product, CScope* scope)
- {
- if ( scope == 0 ) {
- return 0;
- }
- return GetPROTForProduct(scope->GetBioseqHandle(product));
- }
- const CSeq_feat* GetPROTForProduct(const CBioseq_Handle& bsh)
- {
- if ( bsh ) {
- CFeat_CI fi(bsh,
- 0, 0,
- CSeqFeatData::e_Prot,
- SAnnotSelector::eOverlap_Intervals,
- SAnnotSelector::eResolve_TSE,
- CFeat_CI::e_Product,
- 0);
- if ( fi ) {
- return &(fi->GetOriginalFeature());
- }
- }
- return 0;
- }
- // Get the encoding mRNA feature of a given mRNA (cDNA) bioseq.
- const CSeq_feat* GetmRNAForProduct(const CBioseq& product, CScope* scope)
- {
- if ( scope == 0 ) {
- return 0;
- }
- return GetmRNAForProduct(scope->GetBioseqHandle(product));
- }
- const CSeq_feat* GetmRNAForProduct(const CBioseq_Handle& bsh)
- {
- if ( bsh ) {
- SAnnotSelector as;
- as.SetFeatSubtype(CSeqFeatData::eSubtype_mRNA);
- as.SetByProduct();
- CFeat_CI fi(bsh, 0, 0, as);
- if ( fi ) {
- return &(fi->GetOriginalFeature());
- }
- }
- return 0;
- }
- // Get the encoding sequence of a protein
- const CBioseq* GetNucleotideParent(const CBioseq& product, CScope* scope)
- {
- if ( scope == 0 ) {
- return 0;
- }
- CBioseq_Handle bsh = GetNucleotideParent(scope->GetBioseqHandle(product));
- return bsh ? &(bsh.GetBioseq()) : 0;
- }
- CBioseq_Handle GetNucleotideParent(const CBioseq_Handle& bsh)
- {
- // If protein use CDS to get to the encoding Nucleotide.
- // if nucleotide (cDNA) use mRNA feature.
- const CSeq_feat* sfp = bsh.GetBioseqMolType() == CSeq_inst::eMol_aa ?
- GetCDSForProduct(bsh) : GetmRNAForProduct(bsh);
- CBioseq_Handle ret;
- if ( sfp ) {
- ret = bsh.GetScope().GetBioseqHandle(sfp->GetLocation());
- }
- return ret;
- }
- END_SCOPE(sequence)
- void CFastaOstream::Write(const CBioseq_Handle& handle,
- const CSeq_loc* location)
- {
- WriteTitle(handle);
- WriteSequence(handle, location);
- }
- void CFastaOstream::WriteTitle(const CBioseq_Handle& handle)
- {
- m_Out << '>' << CSeq_id::GetStringDescr(*handle.GetBioseqCore(),
- CSeq_id::eFormat_FastA)
- << ' ' << sequence::GetTitle(handle) << NcbiEndl;
- }
- // XXX - replace with SRelLoc?
- struct SGap {
- SGap(TSeqPos start, TSeqPos length) : m_Start(start), m_Length(length) { }
- TSeqPos GetEnd(void) const { return m_Start + m_Length - 1; }
- TSeqPos m_Start, m_Length;
- };
- typedef list<SGap> TGaps;
- static bool s_IsGap(const CSeq_loc& loc, CScope& scope)
- {
- if (loc.IsNull()) {
- return true;
- }
- CTypeConstIterator<CSeq_id> id(loc);
- CBioseq_Handle handle = scope.GetBioseqHandle(*id);
- if (handle && handle.GetBioseqCore()->GetInst().GetRepr()
- == CSeq_inst::eRepr_virtual) {
- return true;
- }
- return false; // default
- }
- static TGaps s_FindGaps(const CSeq_ext& ext, CScope& scope)
- {
- TSeqPos pos = 0;
- TGaps gaps;
- switch (ext.Which()) {
- case CSeq_ext::e_Seg:
- ITERATE (CSeg_ext::Tdata, it, ext.GetSeg().Get()) {
- TSeqPos length = sequence::GetLength(**it, &scope);
- if (s_IsGap(**it, scope)) {
- gaps.push_back(SGap(pos, length));
- }
- pos += length;
- }
- break;
- case CSeq_ext::e_Delta:
- ITERATE (CDelta_ext::Tdata, it, ext.GetDelta().Get()) {
- switch ((*it)->Which()) {
- case CDelta_seq::e_Loc:
- {
- const CSeq_loc& loc = (*it)->GetLoc();
- TSeqPos length = sequence::GetLength(loc, &scope);
- if (s_IsGap(loc, scope)) {
- gaps.push_back(SGap(pos, length));
- }
- pos += length;
- break;
- }
- case CDelta_seq::e_Literal:
- {
- const CSeq_literal& lit = (*it)->GetLiteral();
- TSeqPos length = lit.GetLength();
- if ( !lit.IsSetSeq_data() ) {
- gaps.push_back(SGap(pos, length));
- }
- pos += length;
- break;
- }
- default:
- ERR_POST(Warning << "CFastaOstream::WriteSequence: "
- "unsupported Delta-seq selection "
- << CDelta_seq::SelectionName((*it)->Which()));
- break;
- }
- }
- default:
- break;
- }
- return gaps;
- }
- static TGaps s_AdjustGaps(const TGaps& gaps, const CSeq_loc& location)
- {
- // assume location matches handle
- const TSeqPos kMaxPos = numeric_limits<TSeqPos>::max();
- TSeqPos pos = 0;
- TGaps::const_iterator gap_it = gaps.begin();
- TGaps adjusted_gaps;
- SGap new_gap(kMaxPos, 0);
- for (CSeq_loc_CI loc_it(location); loc_it && gap_it != gaps.end();
- pos += loc_it.GetRange().GetLength(), ++loc_it) {
- CSeq_loc_CI::TRange range = loc_it.GetRange();
- if (new_gap.m_Start != kMaxPos) {
- // in progress
- if (gap_it->GetEnd() < range.GetFrom()) {
- adjusted_gaps.push_back(new_gap);
- new_gap.m_Start = kMaxPos;
- ++gap_it;
- } else if (gap_it->GetEnd() <= range.GetTo()) {
- new_gap.m_Length += gap_it->GetEnd() - range.GetFrom() + 1;
- adjusted_gaps.push_back(new_gap);
- new_gap.m_Start = kMaxPos;
- ++gap_it;
- } else {
- new_gap.m_Length += range.GetLength();
- continue;
- }
- }
- while (gap_it->GetEnd() < range.GetFrom()) {
- ++gap_it; // skip
- }
- if (gap_it->m_Start <= range.GetFrom()) {
- if (gap_it->GetEnd() <= range.GetTo()) {
- adjusted_gaps.push_back
- (SGap(pos, gap_it->GetEnd() - range.GetFrom() + 1));
- ++gap_it;
- } else {
- new_gap.m_Start = pos;
- new_gap.m_Length = range.GetLength();
- continue;
- }
- }
- while (gap_it->m_Start <= range.GetTo()) {
- TSeqPos pos2 = pos + gap_it->m_Start - range.GetFrom();
- if (gap_it->GetEnd() <= range.GetTo()) {
- adjusted_gaps.push_back(SGap(pos2, gap_it->m_Length));
- ++gap_it;
- } else {
- new_gap.m_Start = pos2;
- new_gap.m_Length = range.GetTo() - gap_it->m_Start + 1;
- }
- }
- }
- if (new_gap.m_Start != kMaxPos) {
- adjusted_gaps.push_back(new_gap);
- }
- return adjusted_gaps;
- }
- void CFastaOstream::WriteSequence(const CBioseq_Handle& handle,
- const CSeq_loc* location)
- {
- CConstRef<CBioseq> seq = handle.GetCompleteBioseq();
- const CSeq_inst& inst = seq->GetInst();
- if ( !(m_Flags & eAssembleParts) && !inst.IsSetSeq_data() ) {
- return;
- }
- CSeqVector v = (location
- ? handle.GetSequenceView(*location,
- CBioseq_Handle::eViewMerged,
- CBioseq_Handle::eCoding_Iupac)
- : handle.GetSeqVector(CBioseq_Handle::eCoding_Iupac));
- bool is_na = inst.GetMol() != CSeq_inst::eMol_aa;
- // autodetection is sometimes broken (!)
- v.SetCoding(is_na ? CSeq_data::e_Iupacna : CSeq_data::e_Iupacaa);
- TSeqPos size = v.size();
- TSeqPos pos = 0;
- CSeqVector::TResidue gap = v.GetGapChar();
- string buffer;
- TGaps gaps;
- CScope& scope = handle.GetScope();
- const TSeqPos kMaxPos = numeric_limits<TSeqPos>::max();
- if ( !inst.IsSetSeq_data() && inst.IsSetExt() ) {
- gaps = s_FindGaps(inst.GetExt(), scope);
- if (location) {
- gaps = s_AdjustGaps(gaps, *location);
- }
- }
- gaps.push_back(SGap(kMaxPos, 0));
- while (pos < size) {
- unsigned int limit = min(m_Width,
- min(size, gaps.front().m_Start) - pos);
- v.GetSeqData(pos, pos + limit, buffer);
- pos += limit;
- if (limit > 0) {
- m_Out << buffer << 'n';
- }
- if (pos == gaps.front().m_Start) {
- if (m_Flags & eInstantiateGaps) {
- for (TSeqPos l = gaps.front().m_Length; l > 0; l -= m_Width) {
- m_Out << string(min(l, m_Width), gap) << 'n';
- }
- } else {
- m_Out << "-n";
- }
- pos += gaps.front().m_Length;
- gaps.pop_front();
- }
- }
- m_Out << NcbiFlush;
- }
- void CFastaOstream::Write(CSeq_entry& entry, const CSeq_loc* location)
- {
- CObjectManager om;
- CScope scope(om);
- scope.AddTopLevelSeqEntry(entry);
- for (CTypeConstIterator<CBioseq> it(entry); it; ++it) {
- if ( !SkipBioseq(*it) ) {
- CBioseq_Handle h = scope.GetBioseqHandle(*it->GetId().front());
- Write(h, location);
- }
- }
- }
- void CFastaOstream::Write(CBioseq& seq, const CSeq_loc* location)
- {
- CSeq_entry entry;
- entry.SetSeq(seq);
- Write(entry, location);
- }
- /////////////////////////////////////////////////////////////////////////////
- //
- // sequence translation
- //
- template <class Container>
- void x_Translate(const Container& seq,
- string& prot,
- const CGenetic_code* code)
- {
- // reserve our space
- const size_t mod = seq.size() % 3;
- prot.erase();
- prot.reserve(seq.size() / 3 + (mod ? 1 : 0));
- // get appropriate translation table
- const CTrans_table & tbl =
- (code ? CGen_code_table::GetTransTable(*code) :
- CGen_code_table::GetTransTable(1));
- // main loop through bases
- typename Container::const_iterator start = seq.begin();
- size_t i;
- size_t k;
- size_t state = 0;
- size_t length = seq.size() / 3;
- for (i = 0; i < length; ++i) {
- // loop through one codon at a time
- for (k = 0; k < 3; ++k, ++start) {
- state = tbl.NextCodonState(state, *start);
- }
- // save translated amino acid
- prot.append(1, tbl.GetCodonResidue(state));
- }
- if (mod) {
- LOG_POST(Warning <<
- "translation of sequence whose length "
- "is not an even number of codons");
- for (k = 0; k < mod; ++k, ++start) {
- state = tbl.NextCodonState(state, *start);
- }
- for ( ; k < 3; ++k) {
- state = tbl.NextCodonState(state, 'N');
- }
- // save translated amino acid
- prot.append(1, tbl.GetCodonResidue(state));
- }
- }
- void CSeqTranslator::Translate(const string& seq, string& prot,
- const CGenetic_code* code,
- bool include_stop,
- bool remove_trailing_X)
- {
- x_Translate(seq, prot, code);
- }
- void CSeqTranslator::Translate(const CSeqVector& seq, string& prot,
- const CGenetic_code* code,
- bool include_stop,
- bool remove_trailing_X)
- {
- x_Translate(seq, prot, code);
- }
- void CSeqTranslator::Translate(const CSeq_loc& loc,
- const CBioseq_Handle& handle,
- string& prot,
- const CGenetic_code* code,
- bool include_stop,
- bool remove_trailing_X)
- {
- CSeqVector seq =
- handle.GetSequenceView(loc, CBioseq_Handle::eViewConstructed,
- CBioseq_Handle::eCoding_Iupac);
- x_Translate(seq, prot, code);
- }
- void CCdregion_translate::ReadSequenceByLocation (string& seq,
- const CBioseq_Handle& bsh,
- const CSeq_loc& loc)
- {
- // get vector of sequence under location
- CSeqVector seqv = bsh.GetSequenceView (loc,
- CBioseq_Handle::eViewConstructed,
- CBioseq_Handle::eCoding_Iupac);
- seqv.GetSeqData(0, seqv.size(), seq);
- }
- void CCdregion_translate::TranslateCdregion (string& prot,
- const CBioseq_Handle& bsh,
- const CSeq_loc& loc,
- const CCdregion& cdr,
- bool include_stop,
- bool remove_trailing_X,
- bool* alt_start)
- {
- // clear contents of result string
- prot.erase();
- if ( alt_start != 0 ) {
- *alt_start = false;
- }
- // copy bases from coding region location
- string bases = "";
- ReadSequenceByLocation (bases, bsh, loc);
- // calculate offset from frame parameter
- int offset = 0;
- if (cdr.IsSetFrame ()) {
- switch (cdr.GetFrame ()) {
- case CCdregion::eFrame_two :
- offset = 1;
- break;
- case CCdregion::eFrame_three :
- offset = 2;
- break;
- default :
- break;
- }
- }
- int dnalen = bases.size () - offset;
- if (dnalen < 1) return;
- // pad bases string if last codon is incomplete
- bool incomplete_last_codon = false;
- int mod = dnalen % 3;
- if ( mod != 0 ) {
- incomplete_last_codon = true;
- bases += (mod == 1) ? "NN" : "N";
- dnalen += 3 - mod;
- }
- _ASSERT((dnalen >= 3) && ((dnalen % 3) == 0));
- // resize output protein translation string
- prot.resize(dnalen / 3);
- // get appropriate translation table
- const CTrans_table& tbl = cdr.IsSetCode() ?
- CGen_code_table::GetTransTable(cdr.GetCode()) :
- CGen_code_table::GetTransTable(1);
- // main loop through bases
- string::const_iterator it = bases.begin();
- string::const_iterator end = bases.end();
- for ( int i = 0; i < offset; ++i ) {
- ++it;
- }
- unsigned int state = 0, j = 0;
- while ( it != end ) {
- // get one codon at a time
- state = tbl.NextCodonState(state, *it++);
- state = tbl.NextCodonState(state, *it++);
- state = tbl.NextCodonState(state, *it++);
- // save translated amino acid
- prot[j++] = tbl.GetCodonResidue(state);
- }
- // check for alternative start codon
- if ( alt_start != 0 ) {
- state = 0;
- state = tbl.NextCodonState (state, bases[0]);
- state = tbl.NextCodonState (state, bases[1]);
- state = tbl.NextCodonState (state, bases[2]);
- if ( tbl.IsAltStart(state) ) {
- *alt_start = true;
- }
- }
- // if complete at 5' end, require valid start codon
- if (offset == 0 && (! loc.IsPartialLeft ())) {
- state = tbl.SetCodonState (bases [offset], bases [offset + 1], bases [offset + 2]);
- prot [0] = tbl.GetStartResidue (state);
- }
- // code break substitution
- if (cdr.IsSetCode_break ()) {
- SIZE_TYPE protlen = prot.size ();
- ITERATE (CCdregion::TCode_break, code_break, cdr.GetCode_break ()) {
- const CRef <CCode_break> brk = *code_break;
- const CSeq_loc& cbk_loc = brk->GetLoc ();
- TSeqPos seq_pos = sequence::LocationOffset (loc, cbk_loc, sequence::eOffset_FromStart, &bsh.GetScope ());
- seq_pos -= offset;
- SIZE_TYPE i = seq_pos / 3;
- if (i >= 0 && i < protlen) {
- const CCode_break::C_Aa& c_aa = brk->GetAa ();
- if (c_aa.IsNcbieaa ()) {
- prot [i] = c_aa.GetNcbieaa ();
- }
- }
- }
- }
- // optionally truncate at first terminator
- if (! include_stop) {
- SIZE_TYPE protlen = prot.size ();
- for (SIZE_TYPE i = 0; i < protlen; i++) {
- if (prot [i] == '*') {
- prot.resize (i);
- return;
- }
- }
- }
- // if padding was needed, trim ambiguous last residue
- if (incomplete_last_codon) {
- int protlen = prot.size ();
- if (protlen > 0 && prot [protlen - 1] == 'X') {
- protlen--;
- prot.resize (protlen);
- }
- }
- // optionally remove trailing X on 3' partial coding region
- if (remove_trailing_X) {
- int protlen = prot.size ();
- while (protlen > 0 && prot [protlen - 1] == 'X') {
- protlen--;
- }
- prot.resize (protlen);
- }
- }
- void CCdregion_translate::TranslateCdregion
- (string& prot,
- const CSeq_feat& cds,
- CScope& scope,
- bool include_stop,
- bool remove_trailing_X,
- bool* alt_start)
- {
- _ASSERT(cds.GetData().IsCdregion());
- prot.erase();
- CBioseq_Handle bsh = scope.GetBioseqHandle(cds.GetLocation());
- if ( !bsh ) {
- return;
- }
- CCdregion_translate::TranslateCdregion(
- prot,
- bsh,
- cds.GetLocation(),
- cds.GetData().GetCdregion(),
- include_stop,
- remove_trailing_X,
- alt_start);
- }
- SRelLoc::SRelLoc(const CSeq_loc& parent, const CSeq_loc& child, CScope* scope,
- SRelLoc::TFlags flags)
- : m_ParentLoc(&parent)
- {
- typedef CSeq_loc::TRange TRange0;
- for (CSeq_loc_CI cit(child); cit; ++cit) {
- const CSeq_id& cseqid = cit.GetSeq_id();
- TRange0 crange = cit.GetRange();
- if (crange.IsWholeTo() && scope) {
- // determine actual end
- crange.SetToOpen(sequence::GetLength(cit.GetSeq_id(), scope));
- }
- ENa_strand cstrand = cit.GetStrand();
- TSeqPos pos = 0;
- for (CSeq_loc_CI pit(parent); pit; ++pit) {
- ENa_strand pstrand = pit.GetStrand();
- TRange0 prange = pit.GetRange();
- if (prange.IsWholeTo() && scope) {
- // determine actual end
- prange.SetToOpen(sequence::GetLength(pit.GetSeq_id(), scope));
- }
- if ( !sequence::IsSameBioseq(cseqid, pit.GetSeq_id(), scope) ) {
- pos += prange.GetLength();
- continue;
- }
- CRef<TRange> intersection(new TRange);
- TSeqPos abs_from, abs_to;
- CConstRef<CInt_fuzz> fuzz_from, fuzz_to;
- if (crange.GetFrom() >= prange.GetFrom()) {
- abs_from = crange.GetFrom();
- fuzz_from = cit.GetFuzzFrom();
- if (abs_from == prange.GetFrom()) {
- // subtract out parent fuzz, if any
- const CInt_fuzz* pfuzz = pit.GetFuzzFrom();
- if (pfuzz) {
- if (fuzz_from) {
- CRef<CInt_fuzz> f(new CInt_fuzz);
- f->Assign(*fuzz_from);
- f->Subtract(*pfuzz, abs_from, abs_from);
- if (f->IsP_m() && !f->GetP_m() ) {
- fuzz_from.Reset(); // cancelled
- } else {
- fuzz_from = f;
- }
- } else {
- fuzz_from = pfuzz->Negative(abs_from);
- }
- }
- }
- } else {
- abs_from = prange.GetFrom();
- // fuzz_from = pit.GetFuzzFrom();
- CRef<CInt_fuzz> f(new CInt_fuzz);
- f->SetLim(CInt_fuzz::eLim_lt);
- fuzz_from = f;
- }
- if (crange.GetTo() <= prange.GetTo()) {
- abs_to = crange.GetTo();
- fuzz_to = cit.GetFuzzTo();
- if (abs_to == prange.GetTo()) {
- // subtract out parent fuzz, if any
- const CInt_fuzz* pfuzz = pit.GetFuzzTo();
- if (pfuzz) {
- if (fuzz_to) {
- CRef<CInt_fuzz> f(new CInt_fuzz);
- f->Assign(*fuzz_to);
- f->Subtract(*pfuzz, abs_to, abs_to);
- if (f->IsP_m() && !f->GetP_m() ) {
- fuzz_to.Reset(); // cancelled
- } else {
- fuzz_to = f;
- }
- } else {
- fuzz_to = pfuzz->Negative(abs_to);
- }
- }
- }
- } else {
- abs_to = prange.GetTo();
- // fuzz_to = pit.GetFuzzTo();
- CRef<CInt_fuzz> f(new CInt_fuzz);
- f->SetLim(CInt_fuzz::eLim_gt);
- fuzz_to = f;
- }
- if (abs_from <= abs_to) {
- if (IsReverse(pstrand)) {
- TSeqPos sigma = pos + prange.GetTo();
- intersection->SetFrom(sigma - abs_to);
- intersection->SetTo (sigma - abs_from);
- if (fuzz_from) {
- intersection->SetFuzz_to().AssignTranslated
- (*fuzz_from, intersection->GetTo(), abs_from);
- intersection->SetFuzz_to().Negate
- (intersection->GetTo());
- }
- if (fuzz_to) {
- intersection->SetFuzz_from().AssignTranslated
- (*fuzz_to, intersection->GetFrom(), abs_to);
- intersection->SetFuzz_from().Negate
- (intersection->GetFrom());
- }
- if (cstrand == eNa_strand_unknown) {
- intersection->SetStrand(pstrand);
- } else {
- intersection->SetStrand(Reverse(cstrand));
- }
- } else {
- TSignedSeqPos delta = pos - prange.GetFrom();
- intersection->SetFrom(abs_from + delta);
- intersection->SetTo (abs_to + delta);
- if (fuzz_from) {
- intersection->SetFuzz_from().AssignTranslated
- (*fuzz_from, intersection->GetFrom(), abs_from);
- }
- if (fuzz_to) {
- intersection->SetFuzz_to().AssignTranslated
- (*fuzz_to, intersection->GetTo(), abs_to);
- }
- if (cstrand == eNa_strand_unknown) {
- intersection->SetStrand(pstrand);
- } else {
- intersection->SetStrand(cstrand);
- }
- }
- // add to m_Ranges, combining with the previous
- // interval if possible
- if ( !(flags & fNoMerge) && !m_Ranges.empty()
- && SameOrientation(intersection->GetStrand(),
- m_Ranges.back()->GetStrand()) ) {
- if (m_Ranges.back()->GetTo() == intersection->GetFrom() - 1
- && !IsReverse(intersection->GetStrand()) ) {
- m_Ranges.back()->SetTo(intersection->GetTo());
- if (intersection->IsSetFuzz_to()) {
- m_Ranges.back()->SetFuzz_to
- (intersection->SetFuzz_to());
- } else {
- m_Ranges.back()->ResetFuzz_to();
- }
- } else if (m_Ranges.back()->GetFrom()
- == intersection->GetTo() + 1
- && IsReverse(intersection->GetStrand())) {
- m_Ranges.back()->SetFrom(intersection->GetFrom());
- if (intersection->IsSetFuzz_from()) {
- m_Ranges.back()->SetFuzz_from
- (intersection->SetFuzz_from());
- } else {
- m_Ranges.back()->ResetFuzz_from();
- }
- } else {
- m_Ranges.push_back(intersection);
- }
- } else {
- m_Ranges.push_back(intersection);
- }
- }
- pos += prange.GetLength();
- }
- }
- }
- // Bother trying to merge?
- CRef<CSeq_loc> SRelLoc::Resolve(const CSeq_loc& new_parent, CScope* scope,
- SRelLoc::TFlags /* flags */)
- const
- {
- typedef CSeq_loc::TRange TRange0;
- CRef<CSeq_loc> result(new CSeq_loc);
- CSeq_loc_mix& mix = result->SetMix();
- ITERATE (TRanges, it, m_Ranges) {
- _ASSERT((*it)->GetFrom() <= (*it)->GetTo());
- TSeqPos pos = 0, start = (*it)->GetFrom();
- bool keep_going = true;
- for (CSeq_loc_CI pit(new_parent); pit; ++pit) {
- TRange0 prange = pit.GetRange();
- if (prange.IsWholeTo() && scope) {
- // determine actual end
- prange.SetToOpen(sequence::GetLength(pit.GetSeq_id(), scope));
- }
- TSeqPos length = prange.GetLength();
- if (start >= pos && start < pos + length) {
- TSeqPos from, to;
- CConstRef<CInt_fuzz> fuzz_from, fuzz_to;
- ENa_strand strand;
- if (IsReverse(pit.GetStrand())) {
- TSeqPos sigma = pos + prange.GetTo();
- from = sigma - (*it)->GetTo();
- to = sigma - start;
- if (from < prange.GetFrom() || from > sigma) {
- from = prange.GetFrom();
- keep_going = true;
- } else {
- keep_going = false;
- }
- if ( !(*it)->IsSetStrand()
- || (*it)->GetStrand() == eNa_strand_unknown) {
- strand = pit.GetStrand();
- } else {
- strand = Reverse((*it)->GetStrand());
- }
- if (from == prange.GetFrom()) {
- fuzz_from = pit.GetFuzzFrom();
- }
- if ( !keep_going && (*it)->IsSetFuzz_to() ) {
- CRef<CInt_fuzz> f(new CInt_fuzz);
- if (fuzz_from) {
- f->Assign(*fuzz_from);
- } else {
- f->SetP_m(0);
- }
- f->Subtract((*it)->GetFuzz_to(), from, (*it)->GetTo(),
- CInt_fuzz::eAmplify);
- if (f->IsP_m() && !f->GetP_m() ) {
- fuzz_from.Reset(); // cancelled
- } else {
- fuzz_from = f;
- }
- }
- if (to == prange.GetTo()) {
- fuzz_to = pit.GetFuzzTo();
- }
- if (start == (*it)->GetFrom()
- && (*it)->IsSetFuzz_from()) {
- CRef<CInt_fuzz> f(new CInt_fuzz);
- if (fuzz_to) {
- f->Assign(*fuzz_to);
- } else {
- f->SetP_m(0);
- }
- f->Subtract((*it)->GetFuzz_from(), to,
- (*it)->GetFrom(), CInt_fuzz::eAmplify);
- if (f->IsP_m() && !f->GetP_m() ) {
- fuzz_to.Reset(); // cancelled
- } else {
- fuzz_to = f;
- }
- }
- } else {
- TSignedSeqPos delta = prange.GetFrom() - pos;
- from = start + delta;
- to = (*it)->GetTo() + delta;
- if (to > prange.GetTo()) {
- to = prange.GetTo();
- keep_going = true;
- } else {
- keep_going = false;
- }
- if ( !(*it)->IsSetStrand()
- || (*it)->GetStrand() == eNa_strand_unknown) {
- strand = pit.GetStrand();
- } else {
- strand = (*it)->GetStrand();
- }
- if (from == prange.GetFrom()) {
- fuzz_from = pit.GetFuzzFrom();
- }
- if (start == (*it)->GetFrom()
- && (*it)->IsSetFuzz_from()) {
- CRef<CInt_fuzz> f(new CInt_fuzz);
- if (fuzz_from) {
- f->Assign(*fuzz_from);
- f->Add((*it)->GetFuzz_from(), from,
- (*it)->GetFrom());
- } else {
- f->AssignTranslated((*it)->GetFuzz_from(), from,
- (*it)->GetFrom());
- }
- if (f->IsP_m() && !f->GetP_m() ) {
- fuzz_from.Reset(); // cancelled
- } else {
- fuzz_from = f;
- }
- }
- if (to == prange.GetTo()) {
- fuzz_to = pit.GetFuzzTo();
- }
- if ( !keep_going && (*it)->IsSetFuzz_to() ) {
- CRef<CInt_fuzz> f(new CInt_fuzz);
- if (fuzz_to) {
- f->Assign(*fuzz_to);
- f->Add((*it)->GetFuzz_to(), to, (*it)->GetTo());
- } else {
- f->AssignTranslated((*it)->GetFuzz_to(), to,
- (*it)->GetTo());
- }
- if (f->IsP_m() && !f->GetP_m() ) {
- fuzz_to.Reset(); // cancelled
- } else {
- fuzz_to = f;
- }
- }
- }
- if (from == to
- && (fuzz_from == fuzz_to
- || (fuzz_from.GetPointer() && fuzz_to.GetPointer()
- && fuzz_from->Equals(*fuzz_to)))) {
- // just a point
- CRef<CSeq_loc> loc(new CSeq_loc);
- CSeq_point& point = loc->SetPnt();
- point.SetPoint(from);
- if (strand != eNa_strand_unknown) {
- point.SetStrand(strand);
- }
- if (fuzz_from) {
- point.SetFuzz().Assign(*fuzz_from);
- }
- point.SetId().Assign(pit.GetSeq_id());
- mix.Set().push_back(loc);
- } else {
- CRef<CSeq_loc> loc(new CSeq_loc);
- CSeq_interval& ival = loc->SetInt();
- ival.SetFrom(from);
- ival.SetTo(to);
- if (strand != eNa_strand_unknown) {
- ival.SetStrand(strand);
- }
- if (fuzz_from) {
- ival.SetFuzz_from().Assign(*fuzz_from);
- }
- if (fuzz_to) {
- ival.SetFuzz_to().Assign(*fuzz_to);
- }
- ival.SetId().Assign(pit.GetSeq_id());
- mix.Set().push_back(loc);
- }
- if (keep_going) {
- start = pos + length;
- } else {
- break;
- }
- }
- pos += length;
- }
- if (keep_going) {
- TSeqPos total_length;
- string label;
- new_parent.GetLabel(&label);
- try {
- total_length = sequence::GetLength(new_parent, scope);
- ERR_POST(Warning << "SRelLoc::Resolve: Relative position "
- << start << " exceeds length (" << total_length
- << ") of parent location " << label);
- } catch (sequence::CNoLength) {
- ERR_POST(Warning << "SRelLoc::Resolve: Relative position "
- << start
- << " exceeds length (???) of parent location "
- << label);
- }
- }
- }
- // clean up output
- switch (mix.Get().size()) {
- case 0:
- result->SetNull();
- break;
- case 1:
- result.Reset(mix.Set().front());
- break;
- default:
- break;
- }
- return result;
- }
- //============================================================================//
- // SeqSearch //
- //============================================================================//
- // Public:
- // =======
- // Constructors and Destructors:
- CSeqSearch::CSeqSearch(IClient *client, bool allow_mismatch) :
- m_AllowOneMismatch(allow_mismatch),
- m_MaxPatLen(0),
- m_Client(client)
- {
- InitializeMaps();
- }
- CSeqSearch::~CSeqSearch(void) {}
- // Add nucleotide pattern or restriction site to sequence search.
- // Uses ambiguity codes, e.g., R = A and G, H = A, C and T
- void CSeqSearch::AddNucleotidePattern
- (const string& name,
- const string& pat,
- int cut_site,
- int overhang)
- {
- string pattern = pat;
- NStr::TruncateSpaces(pattern);
- NStr::ToUpper(pattern);
- // reverse complement pattern to see if it is symetrical
- string rcomp = ReverseComplement(pattern);
- bool symmetric = (pattern == rcomp);
- ENa_strand strand = symmetric ? eNa_strand_both : eNa_strand_plus;
- AddNucleotidePattern(name, pat, cut_site, overhang, strand);
- if ( !symmetric ) {
- AddNucleotidePattern(name, rcomp, pat.length() - cut_site,
- overhang, eNa_strand_minus);
- }
- }
- // Program passes each character in turn to finite state machine.
- int CSeqSearch::Search
- (int current_state,
- char ch,
- int position,
- int length)
- {
- if ( !m_Client ) return 0;
- // on first character, populate state transition table
- if ( !m_Fsa.IsPrimed() ) {
- m_Fsa.Prime();
- }
-
- int next_state = m_Fsa.GetNextState(current_state, ch);
-
- // report any matches at current state to the client object
- if ( m_Fsa.IsMatchFound(next_state) ) {
- ITERATE( vector<CMatchInfo>, it, m_Fsa.GetMatches(next_state) ) {
- //const CMatchInfo& match = *it;
- int start = position - it->GetPattern().length() + 1;
- // prevent multiple reports of patterns for circular sequences.
- if ( start < length ) {
- m_Client->MatchFound(*it, start);
- }
- }
- }
- return next_state;
- }
- // Search an entire bioseq.
- void CSeqSearch::Search(const CBioseq_Handle& bsh)
- {
- if ( !bsh ) return;
- if ( !m_Client ) return; // no one to report to, so why search at all.
- CSeqVector seq_vec = bsh.GetSeqVector(CBioseq_Handle::eCoding_Iupac);
- size_t seq_len = seq_vec.size();
- size_t search_len = seq_len;
- CSeq_inst::ETopology topology =
- bsh.GetBioseqCore()->GetInst().GetTopology();
- if ( topology == CSeq_inst::eTopology_circular ) {
- search_len += m_MaxPatLen - 1;
- }
-
- int state = m_Fsa.GetInitialState();
- for ( size_t i = 0; i < search_len; ++i ) {
- state = Search(state, seq_vec[i % seq_len], i, seq_len);
- }
- }
- // Private:
- // ========
- // translation finite state machine base codes - ncbi4na
- enum EBaseCode {
- eBase_gap = 0,
- eBase_A, /* A */
- eBase_C, /* C */
- eBase_M, /* AC */
- eBase_G, /* G */
- eBase_R, /* AG */
- eBase_S, /* CG */
- eBase_V, /* ACG */
- eBase_T, /* T */
- eBase_W, /* AT */
- eBase_Y, /* CT */
- eBase_H, /* ACT */
- eBase_K, /* GT */
- eBase_D, /* AGT */
- eBase_B, /* CGT */
- eBase_N /* ACGT */
- };
- map<unsigned char, int> CSeqSearch::sm_CharToEnum;
- map<int, unsigned char> CSeqSearch::sm_EnumToChar;
- map<char, char> CSeqSearch::sm_Complement;
- void CSeqSearch::InitializeMaps(void)
- {
- int c, i;
- static const string iupacna_alphabet = "-ACMGRSVTWYHKDBN";
- static const string comp_iupacna_alphabet = "-TGKCYSBAWRDMHVN";
- if ( sm_CharToEnum.empty() ) {
- // illegal characters map to eBase_gap (0)
- for ( c = 0; c < 256; ++c ) {
- sm_CharToEnum[c] = eBase_gap;
- }
-
- // map iupacna alphabet to EBaseCode
- for (i = eBase_gap; i <= eBase_N; ++i) {
- c = iupacna_alphabet[i];
- sm_CharToEnum[c] = (EBaseCode)i;
- c = tolower(c);
- sm_CharToEnum[c] = (EBaseCode)i;
- }
- sm_CharToEnum ['U'] = eBase_T;
- sm_CharToEnum ['u'] = eBase_T;
- sm_CharToEnum ['X'] = eBase_N;
- sm_CharToEnum ['x'] = eBase_N;
-
- // also map ncbi4na alphabet to EBaseCode
- for (c = eBase_gap; c <= eBase_N; ++c) {
- sm_CharToEnum[c] = (EBaseCode)c;
- }
- }
-
- // map EBaseCode to iupacna alphabet
- if ( sm_EnumToChar.empty() ) {
- for (i = eBase_gap; i <= eBase_N; ++i) {
- sm_EnumToChar[i] = iupacna_alphabet[i];
- }
- }
- // initialize table to convert character to complement character
- if ( sm_Complement.empty() ) {
- int len = iupacna_alphabet.length();
- for ( i = 0; i < len; ++i ) {
- sm_Complement.insert(make_pair(iupacna_alphabet[i], comp_iupacna_alphabet[i]));
- }
- }
- }
- string CSeqSearch::ReverseComplement(const string& pattern) const
- {
- size_t len = pattern.length();
- string rcomp = pattern;
- rcomp.resize(len);
- // calculate the complement
- for ( size_t i = 0; i < len; ++i ) {
- rcomp[i] = sm_Complement[pattern[i]];
- }
- // reverse the complement
- reverse(rcomp.begin(), rcomp.end());
- return rcomp;
- }
- void CSeqSearch::AddNucleotidePattern
- (const string& name,
- const string& pattern,
- int cut_site,
- int overhang,
- ENa_strand strand)
- {
- size_t pat_len = pattern.length();
- string temp;
- temp.resize(pat_len);
- CMatchInfo info(name,
- CNcbiEmptyString::Get(),
- cut_site,
- overhang,
- strand);
- if ( pat_len > m_MaxPatLen ) {
- m_MaxPatLen = pat_len;
- }
- ExpandPattern(pattern, temp, 0, pat_len, info);
- }
- void CSeqSearch::ExpandPattern
- (const string& pattern,
- string& temp,
- int position,
- int pat_len,
- CMatchInfo& info)
- {
- static EBaseCode expension[] = { eBase_A, eBase_C, eBase_G, eBase_T };
- if ( position < pat_len ) {
- int code = sm_CharToEnum[(int)pattern[position]];
-
- for ( int i = 0; i < 4; ++i ) {
- if ( code & expension[i] ) {
- temp[position] = sm_EnumToChar[expension[i]];
- ExpandPattern(pattern, temp, position + 1, pat_len, info);
- }
- }
- } else { // recursion base
- // when position reaches pattern length, store one expanded string.
- info.m_Pattern = temp;
- m_Fsa.AddWord(info.m_Pattern, info);
- if ( m_AllowOneMismatch ) {
- char ch;
- // put 'N' at every position if a single mismatch is allowed.
- for ( int i = 0; i < pat_len; ++i ) {
- ch = temp[i];
- temp[i] = 'N';
- info.m_Pattern = temp;
- m_Fsa.AddWord(pattern, info);
- // restore proper character, go on to put N in next position.
- temp[i] = ch;
- }
- }
- }
- }
- END_SCOPE(objects)
- END_NCBI_SCOPE
- /*
- * ===========================================================================
- * $Log: sequence.cpp,v $
- * Revision 1000.3 2004/06/01 19:25:30 gouriano
- * PRODUCTION: UPGRADED [GCC34_MSVC7] Dev-tree R1.82
- *
- * Revision 1.82 2004/05/27 13:48:24 jcherry
- * Replaced some calls to deprecated CBioseq_Handle::GetBioseq() with
- * GetCompleteBioseq() or GetBioseqCore()
- *
- * Revision 1.81 2004/05/25 21:06:34 johnson
- * Bug fix in x_Translate
- *
- * Revision 1.80 2004/05/21 21:42:14 gorelenk
- * Added PCH ncbi_pch.hpp
- *
- * Revision 1.79 2004/05/06 17:39:43 shomrat
- * Fixes to SeqLocMerge
- *
- * Revision 1.78 2004/04/06 14:03:15 dicuccio
- * Added API to extract the single Org-ref from a bioseq handle. Added API to
- * retrieve a single tax-id from a bioseq handle
- *
- * Revision 1.77 2004/04/05 15:56:14 grichenk
- * Redesigned CAnnotTypes_CI: moved all data and data collecting
- * functions to CAnnotDataCollector. CAnnotTypes_CI is no more
- * inherited from SAnnotSelector.
- *
- * Revision 1.76 2004/03/25 20:02:30 vasilche
- * Added several method variants with CBioseq_Handle as argument.
- *
- * Revision 1.75 2004/03/01 18:28:18 dicuccio
- * Changed sequence::Compare() such that a seq-interval of length 1 and a
- * corresponding seq-point compare as the same
- *
- * Revision 1.74 2004/03/01 18:24:22 shomrat
- * Removed branching in the main cdregion translation loop; Added alternative start flag
- *
- * Revision 1.73 2004/02/19 18:16:48 shomrat
- * Implemented SeqlocRevCmp
- *
- * Revision 1.72 2004/02/09 14:43:18 vasilche
- * Added missing typename keyword.
- *
- * Revision 1.71 2004/02/05 19:41:46 shomrat
- * Convenience functions for popular overlapping types
- *
- * Revision 1.70 2004/02/04 18:05:41 grichenk
- * Added annotation filtering by set of types/subtypes.
- * Renamed *Choice to *Type in SAnnotSelector.
- *
- * Revision 1.69 2004/01/30 17:49:29 ucko
- * Add missing "typename"
- *
- * Revision 1.68 2004/01/30 17:22:53 dicuccio
- * Added sequence::GetId(const CBioseq_Handle&) - returns a selected ID class
- * (best, GI). Added CSeqTranslator - utility class to translate raw sequence
- * data
- *
- * Revision 1.67 2004/01/28 17:19:04 shomrat
- * Implemented SeqLocMerge
- *
- * Revision 1.66 2003/12/16 19:37:43 shomrat
- * Retrieve encoding feature and bioseq of a protein
- *
- * Revision 1.65 2003/11/19 22:18:05 grichenk
- * All exceptions are now CException-derived. Catch "exception" rather
- * than "runtime_error".
- *
- * Revision 1.64 2003/10/17 20:55:27 ucko
- * SRelLoc::Resolve: fix a fuzz-handling paste-o.
- *
- * Revision 1.63 2003/10/16 11:55:19 dicuccio
- * Fix for brain-dead MSVC and ambiguous operator&&
- *
- * Revision 1.62 2003/10/15 19:52:18 ucko
- * More adjustments to SRelLoc: support fuzz, opposite-strand children,
- * and resolving against an alternate parent.
- *
- * Revision 1.61 2003/10/08 21:08:38 ucko
- * CCdregion_translate: take const Bioseq_Handles, since there's no need
- * to modify them.
- * TestForOverlap: don't consider sequences circular if
- * circular_len == kInvalidSeqPos
- *
- * Revision 1.60 2003/09/22 18:38:14 grichenk
- * Fixed circular seq-locs processing by TestForOverlap()
- *
- * Revision 1.59 2003/07/17 21:00:50 vasilche
- * Added missing include <list>
- *
- * Revision 1.58 2003/06/19 17:11:43 ucko
- * SRelLoc::SRelLoc: remember to update the base position when skipping
- * parent ranges whose IDs don't match.
- *
- * Revision 1.57 2003/06/13 17:23:32 grichenk
- * Added special processing of multi-ID seq-locs in TestForOverlap()
- *
- * Revision 1.56 2003/06/05 18:08:36 shomrat
- * Allow empty location when computing SeqLocPartial; Adjust GetSeqData in cdregion translation
- *
- * Revision 1.55 2003/06/02 18:58:25 dicuccio
- * Fixed #include directives
- *
- * Revision 1.54 2003/06/02 18:53:32 dicuccio
- * Fixed bungled commit
- *
- * Revision 1.52 2003/05/27 19:44:10 grichenk
- * Added CSeqVector_CI class
- *
- * Revision 1.51 2003/05/15 19:27:02 shomrat
- * Compare handle only if both valid; Check IsLim before GetLim
- *
- * Revision 1.50 2003/05/09 15:37:00 ucko
- * Take const CBioseq_Handle references in CFastaOstream::Write et al.
- *
- * Revision 1.49 2003/05/06 19:34:36 grichenk
- * Fixed GetStrand() (worked fine only for plus and unknown strands)
- *
- * Revision 1.48 2003/05/01 17:55:17 ucko
- * Fix GetLength(const CSeq_id&, CScope*) to return ...::max() rather
- * than throwing if it can't resolve the ID to a handle.
- *
- * Revision 1.47 2003/04/24 16:15:58 vasilche
- * Added missing includes and forward class declarations.
- *
- * Revision 1.46 2003/04/16 19:44:26 grichenk
- * More fixes to TestForOverlap() and GetStrand()
- *
- * Revision 1.45 2003/04/15 20:11:21 grichenk
- * Fixed strands problem in TestForOverlap(), replaced type
- * iterator with container iterator in s_GetStrand().
- *
- * Revision 1.44 2003/04/03 19:03:17 grichenk
- * Two more cases to revert locations in GetBestOverlappingFeat()
- *
- * Revision 1.43 2003/04/02 16:58:59 grichenk
- * Change location and feature in GetBestOverlappingFeat()
- * for eOverlap_Subset.
- *
- * Revision 1.42 2003/03/25 22:00:20 grichenk
- * Added strand checking to TestForOverlap()
- *
- * Revision 1.41 2003/03/18 21:48:35 grichenk
- * Removed obsolete class CAnnot_CI
- *
- * Revision 1.40 2003/03/11 16:00:58 kuznets
- * iterate -> ITERATE
- *
- * Revision 1.39 2003/02/19 16:25:14 grichenk
- * Check strands in GetBestOverlappingFeat()
- *
- * Revision 1.38 2003/02/14 15:41:00 shomrat
- * Minor implementation changes in SeqLocPartialTest
- *
- * Revision 1.37 2003/02/13 14:35:40 grichenk
- * + eOverlap_Contains
- *
- * Revision 1.36 2003/02/10 15:54:01 grichenk
- * Use CFeat_CI->GetMappedFeature() and GetOriginalFeature()
- *
- * Revision 1.35 2003/02/06 22:26:27 vasilche
- * Use CSeq_id::Assign().
- *
- * Revision 1.34 2003/02/06 20:59:16 shomrat
- * Bug fix in SeqLocPartialCheck
- *
- * Revision 1.33 2003/01/22 21:02:23 ucko
- * Fix stupid logic error in SRelLoc's constructor; change LocationOffset
- * to return -1 rather than crashing if the locations don't intersect.
- *
- * Revision 1.32 2003/01/22 20:15:02 vasilche
- * Removed compiler warning.
- *
- * Revision 1.31 2003/01/22 18:17:09 ucko
- * SRelLoc::SRelLoc: change intersection to a CRef, so we don't have to
- * worry about it going out of scope while still referenced (by m_Ranges).
- *
- * Revision 1.30 2003/01/08 20:43:10 ucko
- * Adjust SRelLoc to use (ID-less) Seq-intervals for ranges, so that it
- * will be possible to add support for fuzz and strandedness/orientation.
- *
- * Revision 1.29 2002/12/30 19:38:35 vasilche
- * Optimized CGenbankWriter::WriteSequence.
- * Implemented GetBestOverlappingFeat() with CSeqFeatData::ESubtype selector.
- *
- * Revision 1.28 2002/12/26 21:45:29 grichenk
- * + GetBestOverlappingFeat()
- *
- * Revision 1.27 2002/12/26 21:17:06 dicuccio
- * Minor tweaks to avoid compiler warnings in MSVC (remove unused variables)
- *
- * Revision 1.26 2002/12/20 17:14:18 kans
- * added SeqLocPartialCheck
- *
- * Revision 1.25 2002/12/19 20:24:55 grichenk
- * Updated usage of CRange<>
- *
- * Revision 1.24 2002/12/10 16:56:41 ucko
- * CFastaOstream::WriteTitle: restore leading > accidentally dropped in R1.19.
- *
- * Revision 1.23 2002/12/10 16:34:45 kans
- * implement code break processing in TranslateCdregion
- *
- * Revision 1.22 2002/12/09 20:38:41 ucko
- * +sequence::LocationOffset
- *
- * Revision 1.21 2002/12/06 15:36:05 grichenk
- * Added overlap type for annot-iterators
- *
- * Revision 1.20 2002/12/04 15:38:22 ucko
- * SourceToProduct, ProductToSource: just check whether the feature is a coding
- * region rather than attempting to determine molecule types; drop s_DeduceMol.
- *
- * Revision 1.19 2002/11/26 15:14:04 dicuccio
- * Changed CFastaOStream::WriteTitle() to make use of CSeq_id::GetStringDescr().
- *
- * Revision 1.18 2002/11/25 21:24:46 grichenk
- * Added TestForOverlap() function.
- *
- * Revision 1.17 2002/11/18 19:59:23 shomrat
- * Add CSeqSearch - a nucleotide search utility
- *
- * Revision 1.16 2002/11/12 20:00:25 ucko
- * +SourceToProduct, ProductToSource, SRelLoc
- *
- * Revision 1.15 2002/10/23 19:22:39 ucko
- * Move the FASTA reader from objects/util/sequence.?pp to
- * objects/seqset/Seq_entry.?pp because it doesn't need the OM.
- *
- * Revision 1.14 2002/10/23 18:23:59 ucko
- * Clean up #includes.
- * Add a FASTA reader (known to compile, but not otherwise tested -- take care)
- *
- * Revision 1.13 2002/10/23 16:41:12 clausen
- * Fixed warning in WorkShop53
- *
- * Revision 1.12 2002/10/23 15:33:50 clausen
- * Fixed local variable scope warnings
- *
- * Revision 1.11 2002/10/08 12:35:37 clausen
- * Fixed bugs in GetStrand(), ChangeSeqId() & SeqLocCheck()
- *
- * Revision 1.10 2002/10/07 17:11:16 ucko
- * Fix usage of ERR_POST (caught by KCC)
- *
- * Revision 1.9 2002/10/03 18:44:09 clausen
- * Removed extra whitespace
- *
- * Revision 1.8 2002/10/03 16:33:55 clausen
- * Added functions needed by validate
- *
- * Revision 1.7 2002/09/13 15:30:43 ucko
- * Change resize(0) to erase() at Denis's suggestion.
- *
- * Revision 1.6 2002/09/13 14:28:34 ucko
- * string::clear() doesn't exist under g++ 2.95, so use resize(0) instead. :-/
- *
- * Revision 1.5 2002/09/12 21:39:13 kans
- * added CCdregion_translate and CCdregion_translate
- *
- * Revision 1.4 2002/09/03 21:27:04 grichenk
- * Replaced bool arguments in CSeqVector constructor and getters
- * with enums.
- *
- * Revision 1.3 2002/08/27 21:41:15 ucko
- * Add CFastaOstream.
- *
- * Revision 1.2 2002/06/07 16:11:09 ucko
- * Move everything into the "sequence" namespace.
- * Drop the anonymous-namespace business; "sequence" should provide
- * enough protection, and anonymous namespaces ironically interact poorly
- * with WorkShop's caching when rebuilding shared libraries.
- *
- * Revision 1.1 2002/06/06 18:41:41 clausen
- * Initial version
- *
- * ===========================================================================
- */