Java源码示例:htsjdk.samtools.AlignmentBlock
示例1
/**
* Get the LocusFunction on a gene by gene basis for the alignment blocks of this read.
* If a read is split into multiple blocks, each block should point to the same gene - if blocks point to
* different genes (which can't happen, you can't splice different genes together), then that gene result should not be returned.
* Instead, return null in those cases.
* @param alignmentBlocks
* @param g
* @return
*/
private LocusFunction getLocusFunctionForRead (final SAMRecord rec, final Gene g) {
List<AlignmentBlock> alignmentBlocks = rec.getAlignmentBlocks();
LocusFunction [] blockSummaryFunction = new LocusFunction[alignmentBlocks.size()];
Set<Gene> temp = new HashSet<>();
temp.add(g);
for (int i=0; i<alignmentBlocks.size(); i++) {
AlignmentBlock alignmentBlock =alignmentBlocks.get(i);
LocusFunction [] blockFunctions=getLocusFunctionsByBlock(alignmentBlock, temp);
LocusFunction blockFunction = getLocusFunction(blockFunctions, false);
blockSummaryFunction[i]=blockFunction;
}
LocusFunction readFunction = getLocusFunction(blockSummaryFunction, false);
return readFunction;
}
示例2
/**
* For each alignment block, see which genes have exons that intersect it.
* Only count genes where the alignment blocks are consistent - either the alignment blocks point to the same gene, or
* an alignment block points to a gene and the other to no genes/exons in the set.
* Note that this is not strand specific, which may cause problems if a read overlaps two genes on opposite strands that have overlapping exons.
* @param rec
* @param genes A set of genes that the read originally overlaps.
* @param allowMultipleGenes. If false, and a read overlaps multiple gene exons, then none of the genes are returned. If true, return the set of all genes.
* @return
*/
//TODO: if this is used in the future, make it strand specific.
public Set<Gene> getConsistentExons (final SAMRecord rec, final Set<Gene> genes, final boolean allowMultiGeneReads) {
Set<Gene> result = new HashSet<>();
String refName = rec.getReferenceName();
List<AlignmentBlock> alignmentBlocks = rec.getAlignmentBlocks();
for (AlignmentBlock b: alignmentBlocks) {
Set<Gene> blockGenes = getAlignmentBlockonGeneExon(refName, b, genes);
result.addAll(blockGenes);
/*
// if result is not empty and blockGenes isn't empty, intersect the set and set the result as this new set.
if (result.size()>0 && blockGenes.size()>0) {
if (allowMultiGeneReads) result.addAll(blockGenes);
if (!allowMultiGeneReads) result.retainAll(blockGenes);
} else
// if blockGenes is populated and you're here, then result is empty, so set result to these results
result=blockGenes;
*/
}
if (!allowMultiGeneReads & result.size()>1) return new HashSet<>();
return result;
}
示例3
private SAMRecord tagRead(final SAMRecord r, final OverlapDetector<Interval> od) {
Set<Interval>intervals = new HashSet<>();
List<AlignmentBlock> blocks = r.getAlignmentBlocks();
for (AlignmentBlock b: blocks) {
int refStart =b.getReferenceStart();
int refEnd = refStart+b.getLength()-1;
Interval v = new Interval(r.getReferenceName(), refStart, refEnd);
Collection<Interval> blockResult = od.getOverlaps(v);
intervals.addAll(blockResult);
}
String tagName = getIntervalName(intervals);
if (tagName!=null)
r.setAttribute(this.TAG, tagName);
else
r.setAttribute(this.TAG, null);
return (r);
}
示例4
/**
* Check if a read overlaps any SNPs in the OverlapDetector. Tag reads with SNPs.
* If more than 1 SNP tags a read, make a read for each SNP.
* Simplified since data goes through GeneFunctionIteratorWrapper to take care of how reads/genes interact.
*/
private void processSNP (final SAMRecord r) {
List<AlignmentBlock> blocks = r.getAlignmentBlocks();
Collection<String> snps = new HashSet<>();
for (AlignmentBlock b: blocks) {
int start = b.getReferenceStart();
int end = start + b.getLength() -1;
Interval i = new Interval(r.getReferenceName(), start, end);
Collection<Interval> overlaps = this.snpIntervals.getOverlaps(i);
for (Interval o: overlaps)
snps.add(IntervalTagComparator.toString(o));
}
// 1 read per SNP.
for (String snp:snps) {
SAMRecord rr = Utils.getClone(r);
rr.setAttribute(this.snpTag, snp);
queueRecordForOutput(rr);
}
}
示例5
/**
* For a read, split into the alignment blocks.
* For each alignment block, determine the genes the block overlaps, and the locus function of those genes.
*
* Each alignment block can generate a different functional annotation on the same gene. These should be retained.
* For example, block one can align to an exon, and block two to an intron, and both those annotations are retained.
*
* Only retain genes where alignment blocks all reference that gene. If block one refers to genes A,B and block two to gene A only, then only retain gene A.
*
*/
public Map<Gene, List<LocusFunction>> getFunctionalDataForRead (final SAMRecord rec, final OverlapDetector<Gene> geneOverlapDetector) {
List<AlignmentBlock> alignmentBlocks = rec.getAlignmentBlocks();
Map<AlignmentBlock, Map<Gene, List<LocusFunction>>> map = new HashMap<>();
// gather the locus functions for each alignment block.
for (AlignmentBlock block: alignmentBlocks) {
Interval interval = getInterval(rec.getReferenceName(), block);
Map<Gene, List<LocusFunction>> locusFunctionsForGeneMap = getFunctionalDataForInterval(interval, geneOverlapDetector);
map.put(block, locusFunctionsForGeneMap);
}
// simplify genes by only using genes that are common to all alignment blocks.
Map<Gene, List<LocusFunction>> result = simplifyFunctionalDataAcrossAlignmentBlocks(map);
return result;
}
示例6
private Set<Gene> getAlignmentBlockonGeneExon(final String refName, final AlignmentBlock b, final Set<Gene> genes) {
Set<Gene> result = new HashSet<>();
for (Gene g: genes)
if (getAlignmentBlockOverlapsExon(refName, b, g))
result.add(g);
return (result);
}
示例7
private boolean getAlignmentBlockOverlapsExon (final String refName, final AlignmentBlock b, final Gene g) {
Interval ib = getInterval(refName, b);
for (Transcript t: g)
for (Exon e: t.exons) {
Interval ei = getInterval(refName, e);
if (ib.intersects(ei))
return true;
}
return false;
}
示例8
public LocusFunction [] getLocusFunctionsByBlock (final AlignmentBlock b, final Collection<Gene> overlappingGenes) {
// Get functional class for each position in the alignment block.
final LocusFunction[] locusFunctions = new LocusFunction[b.getLength()];
// By default, if base does not overlap with rRNA or gene, it is intergenic.
Arrays.fill(locusFunctions, 0, locusFunctions.length, LocusFunction.INTERGENIC);
for (final Gene gene : overlappingGenes)
for (final Gene.Transcript transcript : gene)
transcript.assignLocusFunctionForRange(b.getReferenceStart(), locusFunctions);
return locusFunctions;
}
示例9
/**
* Returns 1-based index of first base in read that corresponds to M in CIGAR string.
* Note that first is relative to 5' end, so that for reverse-strand alignment, the index of
* the last base aligned is computed relative to the end of the read.
*/
int getIndexOfFirstAlignedBase(final SAMRecord rec) {
final List<AlignmentBlock> alignmentBlocks = rec.getAlignmentBlocks();
if (rec.getReadNegativeStrandFlag()) {
final AlignmentBlock alignmentBlock = alignmentBlocks.get(alignmentBlocks.size() - 1);
return rec.getReadLength() - CoordMath.getEnd(alignmentBlock.getReadStart(), alignmentBlock.getLength()) + 1;
} else {
return alignmentBlocks.get(0).getReadStart();
}
}
示例10
/** Get the the number of bases in the given alignment block and record that have base quality greater or equal to the minimum */
public static int getNumBasesPassingMinimumBaseQuality(final SAMRecord record, final AlignmentBlock block, final int minimumBaseQuality) {
int basesInBlockAtMinimumQuality = 0;
final byte[] baseQualities = record.getBaseQualities();
for (int idx = block.getReadStart(); idx <= CoordMath.getEnd(block.getLength(), block.getReadStart()); idx++) { // idx is one-based
if (minimumBaseQuality <= baseQualities[idx-1]) basesInBlockAtMinimumQuality++;
}
return basesInBlockAtMinimumQuality;
}
示例11
@Override
public final boolean filterOut(final SAMRecord record) {
final boolean filteredOut = reallyFilterOut(record);
if (filteredOut) {
++filteredRecords;
for (final AlignmentBlock block : record.getAlignmentBlocks()) {
this.filteredBases += block.getLength();
}
}
return filteredOut;
}
示例12
/**
* Returns 1-based index of first base in read that corresponds to M in CIGAR string.
* Note that first is relative to 5' end, so that for reverse-strand alignment, the index of
* the last base aligned is computed relative to the end of the read.
*/
int getIndexOfFirstAlignedBase(final SAMRecord rec) {
final List<AlignmentBlock> alignmentBlocks = rec.getAlignmentBlocks();
if (rec.getReadNegativeStrandFlag()) {
final AlignmentBlock alignmentBlock = alignmentBlocks.get(alignmentBlocks.size() - 1);
return rec.getReadLength() - CoordMath.getEnd(alignmentBlock.getReadStart(), alignmentBlock.getLength()) + 1;
} else {
return alignmentBlocks.get(0).getReadStart();
}
}
示例13
private Interval getInterval (final String refName, final AlignmentBlock b) {
int s = b.getReferenceStart();
int e = s + b.getLength()-1;
Interval i = new Interval(refName, s,e);
return (i);
}
示例14
/**
* For a read on this pileup, get the base and quality of the base that is at the same
* position as the SNP. If the read does not overlap the interval, then return an empty array.
* @param r The read
* @return A length 2 byte array containing the base and quality. Empty if the read does not overlap the interval.
*/
public byte [] getBaseAndQualityOverlappingInterval (final SAMRecord r) {
byte [] result = new byte [2];
List<CigarElement> elements = r.getCigar().getCigarElements();
Iterator<AlignmentBlock> blocks = r.getAlignmentBlocks().iterator();
int finalSNPLocalPosition=-1;
int lengthTraversed=0;
for (CigarElement ce: elements) {
CigarOperator co = ce.getOperator();
// you're in an alignment block
if (co==CigarOperator.M) {
// get the next alignment block
AlignmentBlock b = blocks.next();
int refStart = b.getReferenceStart();
int snpLocalPos=this.getPosition() - refStart +1;
int blockLength=b.getLength();
// is the local position inside this alignment block?
// if not, move onto the next block.
if (snpLocalPos >0 && snpLocalPos<=blockLength) {
// found it! Done.
finalSNPLocalPosition=snpLocalPos+lengthTraversed;
break;
}
}
// consume the bases if necessary and move on to the next element.
if (co.consumesReadBases())
lengthTraversed+=ce.getLength();
}
// if the position is assigned, then add to the pileup.
if (finalSNPLocalPosition!=-1) {
// arrays 0 based.
byte [] quals = r.getBaseQualities();
byte [] bases = r.getReadBases();
byte base = bases[finalSNPLocalPosition-1];
// char baseChar = StringUtil.byteToChar(base);
byte qual = quals[finalSNPLocalPosition-1];
result[0]=base;
result[1]=qual;
return (result);
}
return (ArrayUtils.EMPTY_BYTE_ARRAY);
}
示例15
@Override
protected void acceptRead(final SAMRecord rec, final ReferenceSequence ref) {
// see if the whole read should be skipped
if (recordFilter.filterOut(rec)) return;
// check read group + library
final String library = (rec.getReadGroup() == null) ? UNKNOWN_LIBRARY : getOrElse(rec.getReadGroup().getLibrary(), UNKNOWN_LIBRARY);
if (!libraries.contains(library)) {
// should never happen if SAM is valid
throw new PicardException("Record contains library that is missing from header: " + library);
}
// set up some constants that don't change in the loop below
final int contextFullLength = 2 * CONTEXT_SIZE + 1;
final ArtifactCounter counter = artifactCounters.get(library);
final byte[] readBases = rec.getReadBases();
final byte[] readQuals;
if (USE_OQ) {
final byte[] tmp = rec.getOriginalBaseQualities();
readQuals = tmp == null ? rec.getBaseQualities() : tmp;
} else {
readQuals = rec.getBaseQualities();
}
// iterate over aligned positions
for (final AlignmentBlock block : rec.getAlignmentBlocks()) {
for (int offset = 0; offset < block.getLength(); offset++) {
// remember, these are 1-based!
final int readPos = block.getReadStart() + offset;
final int refPos = block.getReferenceStart() + offset;
// skip low BQ sites
final byte qual = readQuals[readPos - 1];
if (qual < MINIMUM_QUALITY_SCORE) continue;
// skip N bases in read
final char readBase = Character.toUpperCase((char)readBases[readPos - 1]);
if (readBase == 'N') continue;
/**
* Skip regions outside of intervals.
*
* NB: IntervalListReferenceSequenceMask.get() has side-effects which assume
* that successive ReferenceSequence's passed to this method will be in-order
* (e.g. it will break if you call acceptRead() with chr1, then chr2, then chr1
* again). So this only works if the underlying iteration is coordinate-sorted.
*/
if (intervalMask != null && !intervalMask.get(ref.getContigIndex(), refPos)) continue;
// skip dbSNP sites
if (dbSnpMask != null && dbSnpMask.isDbSnpSite(ref.getName(), refPos)) continue;
// skip the ends of the reference
final int contextStartIndex = refPos - CONTEXT_SIZE - 1;
if (contextStartIndex < 0 || contextStartIndex + contextFullLength > ref.length()) continue;
// skip contexts with N bases
final String context = getRefContext(ref, contextStartIndex, contextFullLength);
if (context.contains("N")) continue;
// skip non-ACGT bases
if (!SequenceUtil.isUpperACGTN((byte)readBase))
continue;
// count the base!
counter.countRecord(context, readBase, rec);
}
}
}
示例16
public void acceptRecord(final SAMRecordAndReference args) {
mappedRecordCount++;
final SAMRecord samRecord = args.getSamRecord();
final ReferenceSequence referenceSequence = args.getReferenceSequence();
final byte[] readBases = samRecord.getReadBases();
final byte[] readQualities = samRecord.getBaseQualities();
final byte[] refBases = referenceSequence.getBases();
if (samRecord.getReadLength() < minReadLength) {
smallReadCount++;
return;
} else if (SequenceUtil.countMismatches(samRecord, refBases, true) > Math.round(samRecord.getReadLength() * maxMismatchRate)) {
mismatchCount++;
return;
}
// We only record non-CpG C sites if there was at least one CpG in the read, keep track of
// the values for this record and then apply to the global totals if valid
int recordCpgs = 0;
for (final AlignmentBlock alignmentBlock : samRecord.getAlignmentBlocks()) {
final int blockLength = alignmentBlock.getLength();
final int refFragmentStart = alignmentBlock.getReferenceStart() - 1;
final int readFragmentStart = alignmentBlock.getReadStart() - 1;
final byte[] refFragment = getFragment(refBases, refFragmentStart, blockLength);
final byte[] readFragment = getFragment(readBases, readFragmentStart, blockLength);
final byte[] readQualityFragment = getFragment(readQualities, readFragmentStart, blockLength);
if (samRecord.getReadNegativeStrandFlag()) {
// In the case of a negative strand, reverse (and complement for base arrays) the reference,
// reads & qualities so that it can be treated as a positive strand for the rest of the process
SequenceUtil.reverseComplement(refFragment);
SequenceUtil.reverseComplement(readFragment);
SequenceUtil.reverseQualities(readQualityFragment);
}
for (int i=0; i < blockLength-1; i++) {
final int curRefIndex = getCurRefIndex(refFragmentStart, blockLength, i, samRecord.getReadNegativeStrandFlag());
// Look at a 2-base window to see if we're on a CpG site, and if so check for conversion
// (CG -> TG). We do not consider ourselves to be on a CpG site if we're on the last base of a read
if ((SequenceUtil.basesEqual(refFragment[i], SequenceUtil.C)) &&
(SequenceUtil.basesEqual(refFragment[i+1], SequenceUtil.G))) {
// We want to catch the case where there's a CpG in the reference, even if it is not valid
// to prevent the C showing up as a non-CpG C down below. Otherwise this could have been all
// in one if statement
if (isValidCpg(refFragment, readFragment, readQualityFragment, i)) {
recordCpgs++;
final CpgLocation curLocation = new CpgLocation(samRecord.getReferenceName(), curRefIndex);
cpgTotal.increment(curLocation);
if (SequenceUtil.isBisulfiteConverted(readFragment[i], refFragment[i])) {
cpgConverted.increment(curLocation);
}
}
i++;
} else if (isC(refFragment[i], readFragment[i]) && isAboveCytoQcThreshold(readQualities, i) &&
SequenceUtil.bisulfiteBasesEqual(false, readFragment[i+1], refFragment[i+1])) {
// C base in the reference that's not associated with a CpG
nCytoTotal++;
if (SequenceUtil.isBisulfiteConverted(readFragment[i], refFragment[i])) {
nCytoConverted++;
}
}
}
}
if (recordCpgs == 0) {
noCpgCount++;
}
}