Java源码示例:htsjdk.samtools.AlignmentBlock

示例1
/**
 * Get the LocusFunction on a gene by gene basis for the alignment blocks of this read.
 * If a read is split into multiple blocks, each block should point to the same gene - if blocks point to
 * different genes (which can't happen, you can't splice different genes together), then that gene result should not be returned.
 * Instead, return null in those cases.
 * @param alignmentBlocks
 * @param g
 * @return
 */
private LocusFunction getLocusFunctionForRead (final SAMRecord rec, final Gene g) {
	List<AlignmentBlock> alignmentBlocks = rec.getAlignmentBlocks();

       LocusFunction [] blockSummaryFunction = new LocusFunction[alignmentBlocks.size()];
       Set<Gene> temp = new HashSet<>();
       temp.add(g);

       for (int i=0; i<alignmentBlocks.size(); i++) {
       	AlignmentBlock alignmentBlock =alignmentBlocks.get(i);

       	LocusFunction [] blockFunctions=getLocusFunctionsByBlock(alignmentBlock, temp);
       	LocusFunction blockFunction = getLocusFunction(blockFunctions, false);
       	blockSummaryFunction[i]=blockFunction;
       }

       LocusFunction readFunction = getLocusFunction(blockSummaryFunction, false);
	return readFunction;
}
 
示例2
/**
 * For each alignment block, see which genes have exons that intersect it.
 * Only count genes where the alignment blocks are consistent - either the alignment blocks point to the same gene, or
 * an alignment block points to a gene and the other to no genes/exons in the set.
 * Note that this is not strand specific, which may cause problems if a read overlaps two genes on opposite strands that have overlapping exons.
 * @param rec
 * @param genes A set of genes that the read originally overlaps.
 * @param allowMultipleGenes.  If false, and a read overlaps multiple gene exons, then none of the genes are returned.  If true, return the set of all genes.
 * @return
 */
//TODO: if this is used in the future, make it strand specific.
public Set<Gene> getConsistentExons (final SAMRecord rec, final Set<Gene> genes, final boolean allowMultiGeneReads) {

	Set<Gene> result = new HashSet<>();
	String refName = rec.getReferenceName();
	List<AlignmentBlock> alignmentBlocks = rec.getAlignmentBlocks();
	for (AlignmentBlock b: alignmentBlocks) {
		Set<Gene> blockGenes = getAlignmentBlockonGeneExon(refName, b, genes);
		result.addAll(blockGenes);
		/*
		// if result is not empty and blockGenes isn't empty, intersect the set and set the result as this new set.
		if (result.size()>0 && blockGenes.size()>0) {
			if (allowMultiGeneReads) result.addAll(blockGenes);
			if (!allowMultiGeneReads)  result.retainAll(blockGenes);
		} else
			// if blockGenes is populated and you're here, then result is empty, so set result to these results
			result=blockGenes;
		*/
	}
	if (!allowMultiGeneReads & result.size()>1) return new HashSet<>();
	return result;
}
 
示例3
private SAMRecord tagRead(final SAMRecord r, final OverlapDetector<Interval> od) {
	Set<Interval>intervals = new HashSet<>();
	List<AlignmentBlock> blocks = r.getAlignmentBlocks();
	for (AlignmentBlock b: blocks) {
		int refStart =b.getReferenceStart();
		int refEnd = refStart+b.getLength()-1;
		Interval v = new Interval(r.getReferenceName(), refStart, refEnd);
		Collection<Interval> blockResult = od.getOverlaps(v);
		intervals.addAll(blockResult);
	}
	String tagName = getIntervalName(intervals);
	if (tagName!=null)
		r.setAttribute(this.TAG, tagName);
	else
		r.setAttribute(this.TAG, null);
	return (r);

}
 
示例4
/**
 * Check if a read overlaps any SNPs in the OverlapDetector.  Tag reads with SNPs.
 * If more than 1 SNP tags a read, make a read for each SNP.
 * Simplified since data goes through GeneFunctionIteratorWrapper to take care of how reads/genes interact.
 */
private void processSNP (final SAMRecord r) {
	List<AlignmentBlock> blocks = r.getAlignmentBlocks();

	Collection<String> snps = new HashSet<>();

	for (AlignmentBlock b: blocks) {
		int start = b.getReferenceStart();
		int end = start + b.getLength() -1;

		Interval i = new Interval(r.getReferenceName(), start, end);
		Collection<Interval> overlaps = this.snpIntervals.getOverlaps(i);
		for (Interval o: overlaps)
			snps.add(IntervalTagComparator.toString(o));
	}

	// 1 read per SNP.
	for (String snp:snps) {
		SAMRecord rr = Utils.getClone(r);
		rr.setAttribute(this.snpTag, snp);
		queueRecordForOutput(rr);
	}
}
 
示例5
/**
 * For a read, split into the alignment blocks.
 * For each alignment block, determine the genes the block overlaps, and the locus function of those genes.
 *
 * Each alignment block can generate a different functional annotation on the same gene.  These should be retained.
 * For example, block one can align to an exon, and block two to an intron, and both those annotations are retained.
 *
 * Only retain genes where alignment blocks all reference that gene.  If block one refers to genes A,B and block two to gene A only, then only retain gene A.
 *
 */
public Map<Gene, List<LocusFunction>> getFunctionalDataForRead (final SAMRecord rec, final OverlapDetector<Gene> geneOverlapDetector) {
	List<AlignmentBlock> alignmentBlocks = rec.getAlignmentBlocks();

	Map<AlignmentBlock, Map<Gene, List<LocusFunction>>> map = new HashMap<>();
	// gather the locus functions for each alignment block.
	for (AlignmentBlock block: alignmentBlocks) {
		Interval interval = getInterval(rec.getReferenceName(), block);
		Map<Gene, List<LocusFunction>> locusFunctionsForGeneMap = getFunctionalDataForInterval(interval, geneOverlapDetector);
		map.put(block, locusFunctionsForGeneMap);
	}
	// simplify genes by only using genes that are common to all alignment blocks.
	Map<Gene, List<LocusFunction>> result = simplifyFunctionalDataAcrossAlignmentBlocks(map);
	return result;
}
 
示例6
private Set<Gene> getAlignmentBlockonGeneExon(final String refName, final AlignmentBlock b, final Set<Gene> genes) {
	Set<Gene> result = new HashSet<>();
	for (Gene g: genes)
		if (getAlignmentBlockOverlapsExon(refName, b, g))
			result.add(g);
	return (result);
}
 
示例7
private boolean getAlignmentBlockOverlapsExon (final String refName, final AlignmentBlock b, final Gene g) {

		Interval ib = getInterval(refName, b);

		for (Transcript t: g)
			for (Exon e: t.exons) {
				Interval ei = getInterval(refName, e);
				if (ib.intersects(ei))
					return true;
			}
		return false;
	}
 
示例8
public LocusFunction [] getLocusFunctionsByBlock (final AlignmentBlock b, final Collection<Gene> overlappingGenes) {
	// Get functional class for each position in the alignment block.
       final LocusFunction[] locusFunctions = new LocusFunction[b.getLength()];

       // By default, if base does not overlap with rRNA or gene, it is intergenic.
       Arrays.fill(locusFunctions, 0, locusFunctions.length, LocusFunction.INTERGENIC);

       for (final Gene gene : overlappingGenes)
		for (final Gene.Transcript transcript : gene)
			transcript.assignLocusFunctionForRange(b.getReferenceStart(), locusFunctions);
       return locusFunctions;
}
 
示例9
/**
 * Returns 1-based index of first base in read that corresponds to M in CIGAR string.
 * Note that first is relative to 5' end, so that for reverse-strand alignment, the index of
 * the last base aligned is computed relative to the end of the read.
 */
int getIndexOfFirstAlignedBase(final SAMRecord rec) {
    final List<AlignmentBlock> alignmentBlocks = rec.getAlignmentBlocks();
    if (rec.getReadNegativeStrandFlag()) {
        final AlignmentBlock alignmentBlock = alignmentBlocks.get(alignmentBlocks.size() - 1);
        return rec.getReadLength() - CoordMath.getEnd(alignmentBlock.getReadStart(), alignmentBlock.getLength()) + 1;
    } else {
        return alignmentBlocks.get(0).getReadStart();
    }
}
 
示例10
/** Get the the number of bases in the given alignment block and record that have base quality greater or equal to the minimum */
public static int getNumBasesPassingMinimumBaseQuality(final SAMRecord record, final AlignmentBlock block, final int minimumBaseQuality) {
    int basesInBlockAtMinimumQuality = 0;
    final byte[] baseQualities = record.getBaseQualities();
    for (int idx = block.getReadStart(); idx <= CoordMath.getEnd(block.getLength(), block.getReadStart()); idx++) { // idx is one-based
        if (minimumBaseQuality <= baseQualities[idx-1]) basesInBlockAtMinimumQuality++;
    }
    return basesInBlockAtMinimumQuality;
}
 
示例11
@Override
public final boolean filterOut(final SAMRecord record) {
    final boolean filteredOut = reallyFilterOut(record);
    if (filteredOut) {
        ++filteredRecords;
        for (final AlignmentBlock block : record.getAlignmentBlocks()) {
            this.filteredBases += block.getLength();
        }
    }
    return filteredOut;
}
 
示例12
/**
 * Returns 1-based index of first base in read that corresponds to M in CIGAR string.
 * Note that first is relative to 5' end, so that for reverse-strand alignment, the index of
 * the last base aligned is computed relative to the end of the read.
 */
int getIndexOfFirstAlignedBase(final SAMRecord rec) {
    final List<AlignmentBlock> alignmentBlocks = rec.getAlignmentBlocks();
    if (rec.getReadNegativeStrandFlag()) {
        final AlignmentBlock alignmentBlock = alignmentBlocks.get(alignmentBlocks.size() - 1);
        return rec.getReadLength() - CoordMath.getEnd(alignmentBlock.getReadStart(), alignmentBlock.getLength()) + 1;
    } else {
        return alignmentBlocks.get(0).getReadStart();
    }
}
 
示例13
private Interval getInterval (final String refName, final AlignmentBlock b) {
	int s = b.getReferenceStart();
	int e = s + b.getLength()-1;
	Interval i = new Interval(refName, s,e);
	return (i);
}
 
示例14
/**
 * For a read on this pileup, get the base and quality of the base that is at the same
 * position as the SNP.  If the read does not overlap the interval, then return an empty array.
 * @param r The read
 * @return A length 2 byte array containing the base and quality.  Empty if the read does not overlap the interval.
 */
public byte [] getBaseAndQualityOverlappingInterval (final SAMRecord r) {
	byte [] result = new byte [2];

	List<CigarElement> elements = r.getCigar().getCigarElements();
	Iterator<AlignmentBlock> blocks = r.getAlignmentBlocks().iterator();

	int finalSNPLocalPosition=-1;
	int lengthTraversed=0;

	for (CigarElement ce: elements) {
		CigarOperator co = ce.getOperator();
		// you're in an alignment block
		if (co==CigarOperator.M) {
			// get the next alignment block
			AlignmentBlock b = blocks.next();
			int refStart = b.getReferenceStart();
			int snpLocalPos=this.getPosition() - refStart +1;
			int blockLength=b.getLength();

			// is the local position inside this alignment block?
			// if not, move onto the next block.
			if (snpLocalPos >0 && snpLocalPos<=blockLength) {
				// found it!  Done.
				finalSNPLocalPosition=snpLocalPos+lengthTraversed;
				break;
			}
		}
		// consume the bases if necessary and move on to the next element.
		if (co.consumesReadBases())
			lengthTraversed+=ce.getLength();
	}

	// if the position is assigned, then add to the pileup.
	if (finalSNPLocalPosition!=-1) {
		// arrays 0 based.
		byte [] quals = r.getBaseQualities();
		byte [] bases = r.getReadBases();
		byte base = bases[finalSNPLocalPosition-1];
		// char baseChar = StringUtil.byteToChar(base);
		byte qual = quals[finalSNPLocalPosition-1];
		result[0]=base;
		result[1]=qual;
		return (result);
	}
	return (ArrayUtils.EMPTY_BYTE_ARRAY);
}
 
示例15
@Override
protected void acceptRead(final SAMRecord rec, final ReferenceSequence ref) {
    // see if the whole read should be skipped
    if (recordFilter.filterOut(rec)) return;

    // check read group + library
    final String library = (rec.getReadGroup() == null) ? UNKNOWN_LIBRARY : getOrElse(rec.getReadGroup().getLibrary(), UNKNOWN_LIBRARY);
    if (!libraries.contains(library)) {
        // should never happen if SAM is valid
        throw new PicardException("Record contains library that is missing from header: " + library);
    }

    // set up some constants that don't change in the loop below
    final int contextFullLength = 2 * CONTEXT_SIZE + 1;
    final ArtifactCounter counter = artifactCounters.get(library);
    final byte[] readBases = rec.getReadBases();
    final byte[] readQuals;
    if (USE_OQ) {
        final byte[] tmp = rec.getOriginalBaseQualities();
        readQuals = tmp == null ? rec.getBaseQualities() : tmp;
    } else {
        readQuals = rec.getBaseQualities();
    }

    // iterate over aligned positions
    for (final AlignmentBlock block : rec.getAlignmentBlocks()) {
        for (int offset = 0; offset < block.getLength(); offset++) {
            // remember, these are 1-based!
            final int readPos = block.getReadStart() + offset;
            final int refPos = block.getReferenceStart() + offset;

            // skip low BQ sites
            final byte qual = readQuals[readPos - 1];
            if (qual < MINIMUM_QUALITY_SCORE) continue;

            // skip N bases in read
            final char readBase = Character.toUpperCase((char)readBases[readPos - 1]);
            if (readBase == 'N') continue;

            /**
             * Skip regions outside of intervals.
             *
             * NB: IntervalListReferenceSequenceMask.get() has side-effects which assume
             * that successive ReferenceSequence's passed to this method will be in-order
             * (e.g. it will break if you call acceptRead() with chr1, then chr2, then chr1
             * again). So this only works if the underlying iteration is coordinate-sorted.
             */
            if (intervalMask != null && !intervalMask.get(ref.getContigIndex(), refPos)) continue;

            // skip dbSNP sites
            if (dbSnpMask != null && dbSnpMask.isDbSnpSite(ref.getName(), refPos)) continue;

            // skip the ends of the reference
            final int contextStartIndex = refPos - CONTEXT_SIZE - 1;
            if (contextStartIndex < 0 || contextStartIndex + contextFullLength > ref.length()) continue;

            // skip contexts with N bases
            final String context = getRefContext(ref, contextStartIndex, contextFullLength);
            if (context.contains("N")) continue;

            // skip non-ACGT bases
            if (!SequenceUtil.isUpperACGTN((byte)readBase))
                continue;

            // count the base!
            counter.countRecord(context, readBase, rec);
        }
    }
}
 
示例16
public void acceptRecord(final SAMRecordAndReference args) {
	mappedRecordCount++;

	final SAMRecord samRecord = args.getSamRecord();
	final ReferenceSequence referenceSequence = args.getReferenceSequence();

	final byte[] readBases = samRecord.getReadBases();
	final byte[] readQualities = samRecord.getBaseQualities();
	final byte[] refBases = referenceSequence.getBases();

	if (samRecord.getReadLength() < minReadLength) {
		smallReadCount++;
		return;
	} else if (SequenceUtil.countMismatches(samRecord, refBases, true) > Math.round(samRecord.getReadLength() * maxMismatchRate)) {
		mismatchCount++;
		return;
	}

	// We only record non-CpG C sites if there was at least one CpG in the read, keep track of
	// the values for this record and then apply to the global totals if valid
	int recordCpgs = 0;

	for (final AlignmentBlock alignmentBlock : samRecord.getAlignmentBlocks()) {
		final int blockLength = alignmentBlock.getLength();
		final int refFragmentStart = alignmentBlock.getReferenceStart() - 1;
		final int readFragmentStart = alignmentBlock.getReadStart() - 1;

		final byte[] refFragment = getFragment(refBases, refFragmentStart, blockLength);
		final byte[] readFragment = getFragment(readBases, readFragmentStart, blockLength);
		final byte[] readQualityFragment = getFragment(readQualities, readFragmentStart, blockLength);

		if (samRecord.getReadNegativeStrandFlag()) {
			// In the case of a negative strand, reverse (and complement for base arrays) the reference,
			// reads & qualities so that it can be treated as a positive strand for the rest of the process
			SequenceUtil.reverseComplement(refFragment);
			SequenceUtil.reverseComplement(readFragment);
			SequenceUtil.reverseQualities(readQualityFragment);
		}

		for (int i=0; i < blockLength-1; i++) {
			final int curRefIndex = getCurRefIndex(refFragmentStart, blockLength, i, samRecord.getReadNegativeStrandFlag());

			// Look at a 2-base window to see if we're on a CpG site, and if so check for conversion
			// (CG -> TG). We do not consider ourselves to be on a CpG site if we're on the last base of a read
			if ((SequenceUtil.basesEqual(refFragment[i], SequenceUtil.C)) &&
					(SequenceUtil.basesEqual(refFragment[i+1], SequenceUtil.G))) {
				// We want to catch the case where there's a CpG in the reference, even if it is not valid
				// to prevent the C showing up as a non-CpG C down below. Otherwise this could have been all
				// in one if statement
				if (isValidCpg(refFragment, readFragment, readQualityFragment, i)) {
					recordCpgs++;
					final CpgLocation curLocation = new CpgLocation(samRecord.getReferenceName(), curRefIndex);
					cpgTotal.increment(curLocation);
					if (SequenceUtil.isBisulfiteConverted(readFragment[i], refFragment[i])) {
						cpgConverted.increment(curLocation);
					}
				}
				i++;
			} else if (isC(refFragment[i], readFragment[i]) && isAboveCytoQcThreshold(readQualities, i) &&
					SequenceUtil.bisulfiteBasesEqual(false, readFragment[i+1], refFragment[i+1])) {
				// C base in the reference that's not associated with a CpG
				nCytoTotal++;
				if (SequenceUtil.isBisulfiteConverted(readFragment[i], refFragment[i])) {
					nCytoConverted++;
				}
			}
		}
	}

	if (recordCpgs == 0) {
		noCpgCount++;
	}
}