Java源码示例:org.apache.lucene.util.automaton.CompiledAutomaton

示例1
/**
 * Exercises {@code Intervals.multiterm} with the regular expression {@code p.*e}: first checks
 * the intervals reported on {@code field1}, then checks that the same automaton with an
 * expansion limit of 1 fails with an {@link IllegalStateException} carrying the expected message.
 */
public void testMultiTerm() throws IOException {
  RegExp re = new RegExp("p.*e");
  CompiledAutomaton compiledRegexp = new CompiledAutomaton(re.toAutomaton());
  IntervalsSource source = Intervals.multiterm(compiledRegexp, re.toString());

  int[][] expectedIntervals = {
      {},
      { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7 },
      { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7 },
      { 7, 7 },
      { 0, 0, 1, 1, 3, 3, 4, 4, 6, 6, 7, 7 },
      { 0, 0 }
  };
  checkIntervals(source, "field1", 5, expectedIntervals);

  // Asking for the per-leaf intervals with a limit of one expansion must be rejected.
  IllegalStateException thrown = expectThrows(IllegalStateException.class, () -> {
    CompiledAutomaton limitedAutomaton = new CompiledAutomaton(re.toAutomaton());
    IntervalsSource limited = Intervals.multiterm(limitedAutomaton, 1, re.toString());
    for (LeafReaderContext leaf : searcher.getIndexReader().leaves()) {
      limited.intervals("field1", leaf);
    }
  });
  assertEquals("Automaton [\\p(.)*\\e] expanded to too many terms (limit 1)", thrown.getMessage());

  checkVisits(source, 1);
}
 
示例2
/**
 * Returns a {@link TermsEnum} over the values, restricted to those accepted by the given
 * {@link CompiledAutomaton}. The enum supports {@link TermsEnum#ord()}.
 *
 * @param automaton compiled automaton whose {@code type} selects the filtering strategy
 * @return the empty enum for {@code NONE}, the unfiltered enum for {@code ALL}, otherwise a
 *     wrapping enum for {@code SINGLE} or {@code NORMAL}
 */
public TermsEnum intersect(CompiledAutomaton automaton) throws IOException {
  final TermsEnum values = termsEnum();
  final TermsEnum filtered;
  switch (automaton.type) {
    case NONE:
      filtered = TermsEnum.EMPTY;
      break;
    case ALL:
      filtered = values;
      break;
    case SINGLE:
      filtered = new SingleTermsEnum(values, automaton.term);
      break;
    case NORMAL:
      filtered = new AutomatonTermsEnum(values, automaton);
      break;
    default:
      // unreachable
      throw new RuntimeException("unhandled case");
  }
  return filtered;
}
 
示例3
/**
 * Intersects each entry of {@code subs} with the compiled automaton and merges the non-null
 * results into a single {@code MultiTermsEnum}. Returns {@link TermsEnum#EMPTY} when no entry
 * produced an enum.
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  final List<MultiTermsEnum.TermsEnumIndex> matches = new ArrayList<>();
  for (int subIndex = 0; subIndex < subs.length; subIndex++) {
    final TermsEnum subEnum = subs[subIndex].intersect(compiled, startTerm);
    if (subEnum == null) {
      continue;
    }
    // Remember which sub the enum came from alongside the enum itself.
    matches.add(new MultiTermsEnum.TermsEnumIndex(subEnum, subIndex));
  }

  if (matches.isEmpty()) {
    return TermsEnum.EMPTY;
  }
  final MultiTermsEnum merged = new MultiTermsEnum(subSlices);
  return merged.reset(matches.toArray(MultiTermsEnum.TermsEnumIndex.EMPTY_ARRAY));
}
 
示例4
/**
 * Returns a {@link TermsEnum} over the values that the given {@link CompiledAutomaton}
 * accepts. The enum supports {@link TermsEnum#ord()}.
 *
 * @param automaton compiled automaton; its {@code type} decides which enum is returned
 */
public TermsEnum intersect(CompiledAutomaton automaton) throws IOException {
  final TermsEnum allValues = termsEnum();
  final TermsEnum result;
  switch (automaton.type) {
    case NONE:
      result = TermsEnum.EMPTY;
      break;
    case ALL:
      result = allValues;
      break;
    case SINGLE:
      result = new SingleTermsEnum(allValues, automaton.term);
      break;
    case NORMAL:
      result = new AutomatonTermsEnum(allValues, automaton);
      break;
    default:
      // unreachable
      throw new RuntimeException("unhandled case");
  }
  return result;
}
 
示例5
/**
 * Creates an {@code IntersectTermsEnum} from the compiled automaton and {@code startTerm}.
 *
 * @throws IllegalArgumentException if the automaton type is not {@code NORMAL}
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  if (compiled.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    return new IntersectTermsEnum(compiled, startTerm);
  }
  throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
 
示例6
/**
 * Creates an intersecting enum over the term dictionary FST ({@code dict}), using the run
 * automaton of {@code compiled} as the FSA.
 *
 * @param compiled compiled automaton; only its {@code runAutomaton} is read here
 * @param startTerm term passed to {@code doSeekCeil}; {@code null} skips the initial seek
 * @throws IOException declared by this constructor; presumably raised by the FST/frame
 *     helpers called below (confirm against their declarations)
 */
IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  super();
  //if (TEST) System.out.println("Enum init, startTerm=" + startTerm);
  this.fst = dict;
  this.fstReader = fst.getBytesReader();
  this.fstOutputs = dict.outputs;
  this.fsa = compiled.runAutomaton;
  this.level = -1;
  // Pre-allocate all 16 frame slots so each one holds a Frame instance from the start.
  this.stack = new Frame[16];
  for (int i = 0 ; i < stack.length; i++) {
    this.stack[i] = new Frame();
  }

  // NOTE(review): the virtual frame is loaded first, level is bumped from -1 to 0, and only
  // then is the first real frame pushed; the helpers are not visible here, so the exact
  // meaning of "virtual" vs "first" frame should be confirmed before reordering.
  loadVirtualFrame(newFrame());
  this.level++;
  pushFrame(loadFirstFrame(newFrame()));

  this.meta = null;
  this.metaUpto = 1;
  this.decoded = false;
  this.pending = false;

  if (startTerm == null) {
    // No start term: a term is pending only if the top frame is already accepting.
    pending = isAccept(topFrame());
  } else {
    doSeekCeil(startTerm);
    // After the seek, a term is pending only when the current term is absent or differs from
    // startTerm, and the top frame is both valid and accepting.
    pending = (term == null || !startTerm.equals(term.get())) && isValid(topFrame()) && isAccept(topFrame());
  }
}
 
示例7
/**
 * Creates a {@code DirectIntersectTermsEnum} from the compiled automaton and
 * {@code startTerm}.
 *
 * @throws IllegalArgumentException if the automaton type is not {@code NORMAL}
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) {
  if (compiled.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    return new DirectIntersectTermsEnum(compiled, startTerm);
  }
  throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
 
示例8
/**
 * Creates an {@code OrdsIntersectTermsEnum} bound to this reader, the compiled automaton and
 * {@code startTerm}.
 *
 * @throws IllegalArgumentException if the automaton type is not {@code NORMAL}
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  final boolean isNormal = compiled.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL;
  if (!isNormal) {
    throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
  }
  return new OrdsIntersectTermsEnum(this, compiled, startTerm);
}
 
示例9
/**
 * Creates a block reader that is additionally configured from a compiled automaton.
 *
 * <p>All arguments except {@code compiled} and {@code startTerm} are forwarded unchanged to the
 * superclass constructor.
 *
 * @param compiled compiled automaton; its {@code automaton}, {@code runAutomaton},
 *     {@code finite} and {@code commonSuffixRef} members are copied into fields
 * @param startTerm stored as {@code seekTerm}
 * @throws IOException declared by this constructor
 */
protected IntersectBlockReader(CompiledAutomaton compiled, BytesRef startTerm,
                               IndexDictionary.BrowserSupplier dictionaryBrowserSupplier, IndexInput blockInput,
                               PostingsReaderBase postingsReader, FieldMetadata fieldMetadata,
                               BlockDecoder blockDecoder) throws IOException {
  super(dictionaryBrowserSupplier, blockInput, postingsReader, fieldMetadata, blockDecoder);
  automaton = compiled.automaton;
  runAutomaton = compiled.runAutomaton;
  finite = compiled.finite;
  commonSuffix = compiled.commonSuffixRef;
  // NOTE(review): getMinTermLength() and the calculator are created only after the fields
  // above are assigned; presumably they read those fields, so keep this ordering.
  minTermLength = getMinTermLength();
  nextStringCalculator = new AutomatonNextTermCalculator(compiled);
  seekTerm = startTerm;
}
 
示例10
MultiTermIntervalsSource(CompiledAutomaton automaton, int maxExpansions, String pattern) {
  this.automaton = automaton;
  if (maxExpansions > IndexSearcher.getMaxClauseCount()) {
    throw new IllegalArgumentException("maxExpansions [" + maxExpansions
        + "] cannot be greater than BooleanQuery.getMaxClauseCount [" + IndexSearcher.getMaxClauseCount() + "]");
  }
  this.maxExpansions = maxExpansions;
  this.pattern = pattern;
}
 
示例11
/**
 * Delegates to the wrapped {@code in}, asserts that the delegate returned a non-null enum and
 * that {@code bytes}, when given, is valid, then wraps the result in an
 * {@code AssertingTermsEnum}.
 */
@Override
public TermsEnum intersect(CompiledAutomaton automaton, BytesRef bytes) throws IOException {
  final TermsEnum delegateEnum = in.intersect(automaton, bytes);
  assert delegateEnum != null;
  assert bytes == null || bytes.isValid();
  final boolean withFreqs = hasFreqs();
  return new AssertingTermsEnum(delegateEnum, withFreqs);
}
 
示例12
/**
 * Terms api equivalency: compares statistics, the offsets/positions/payloads flags, full
 * iteration and seeking of two {@link Terms}. When {@code deep} is set, it additionally compares
 * their intersections with randomly generated regular expressions.
 *
 * <p>If either side is {@code null}, both must be {@code null}.
 */
public void assertTermsEquals(String info, IndexReader leftReader, Terms leftTerms, Terms rightTerms, boolean deep) throws IOException {
  if (leftTerms == null || rightTerms == null) {
    assertNull(info, leftTerms);
    assertNull(info, rightTerms);
    return;
  }
  assertTermsStatisticsEquals(info, leftTerms, rightTerms);
  assertEquals("hasOffsets", leftTerms.hasOffsets(), rightTerms.hasOffsets());
  assertEquals("hasPositions", leftTerms.hasPositions(), rightTerms.hasPositions());
  assertEquals("hasPayloads", leftTerms.hasPayloads(), rightTerms.hasPayloads());

  TermsEnum leftEnum = leftTerms.iterator();
  TermsEnum rightEnum = rightTerms.iterator();
  assertTermsEnumEquals(info, leftReader, leftEnum, rightEnum, true);

  assertTermsSeekingEquals(info, leftTerms, rightTerms);

  if (!deep) {
    return;
  }

  final int rounds = atLeast(3);
  for (int round = 0; round < rounds; round++) {
    String regexp = AutomatonTestUtil.randomRegexp(random());
    CompiledAutomaton compiled = new CompiledAutomaton(new RegExp(regexp, RegExp.NONE).toAutomaton());
    if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
      // Only NORMAL automata are compared through intersect().
      continue;
    }
    // TODO: test start term too
    TermsEnum leftIntersection = leftTerms.intersect(compiled, null);
    TermsEnum rightIntersection = rightTerms.intersect(compiled, null);
    assertTermsEnumEquals(info, leftReader, leftIntersection, rightIntersection, rarely());
  }
}
 
示例13
/**
 * Creates an {@code IntersectTermsEnum} over this reader from the automaton, run automaton and
 * common suffix of {@code compiled}.
 *
 * @throws IllegalArgumentException if the automaton type is not {@code NORMAL}
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  // if (DEBUG) System.out.println("  FieldReader.intersect startTerm=" + BlockTreeTermsWriter.brToString(startTerm));
  //System.out.println("intersect: " + compiled.type + " a=" + compiled.automaton);
  // TODO: we could push "it's a range" or "it's a prefix" down into IntersectTermsEnum?
  // can we optimize knowing that...?
  if (compiled.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    return new IntersectTermsEnum(
        this, compiled.automaton, compiled.runAutomaton, compiled.commonSuffixRef, startTerm);
  }
  throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
 
示例14
/**
 * Builds one compiled automaton per edit count from 0 to {@code maxEdits} inclusive, so the
 * array index equals the edit count passed to {@code levBuilder.toAutomaton}.
 *
 * @throws FuzzyTermsEnum.FuzzyTermsException wrapping a {@code TooComplexToDeterminizeException}
 *     raised while building any of the automata
 */
CompiledAutomaton[] buildAutomatonSet() {
  final CompiledAutomaton[] byEditCount = new CompiledAutomaton[maxEdits + 1];
  try {
    for (int edits = 0; edits < byEditCount.length; edits++) {
      byEditCount[edits] = new CompiledAutomaton(levBuilder.toAutomaton(edits, prefix), true, false);
    }
  } catch (TooComplexToDeterminizeException e) {
    // The first failure aborts the whole build, exactly as a per-iteration catch would.
    throw new FuzzyTermsEnum.FuzzyTermsException(term, e);
  }
  return byEditCount;
}
 
示例15
/**
 * Builds the compiled automaton for {@code maxEdits} edits and the configured {@code prefix}.
 *
 * @throws FuzzyTermsEnum.FuzzyTermsException wrapping a {@code TooComplexToDeterminizeException}
 *     raised while building the automaton
 */
CompiledAutomaton buildMaxEditAutomaton() {
  final CompiledAutomaton maxEditAutomaton;
  try {
    maxEditAutomaton = new CompiledAutomaton(levBuilder.toAutomaton(maxEdits, prefix), true, false);
  } catch (TooComplexToDeterminizeException e) {
    throw new FuzzyTermsEnum.FuzzyTermsException(term, e);
  }
  return maxEditAutomaton;
}
 
示例16
/**
 * Builds one binary automaton per term in {@code termData}, unions them, and returns the run
 * automaton of the compiled union.
 */
private ByteRunAutomaton asByteRunAutomaton() {
  final List<Automaton> perTerm = new ArrayList<>();
  final TermIterator terms = termData.iterator();
  BytesRef current;
  // The iterator signals exhaustion by returning null.
  while ((current = terms.next()) != null) {
    perTerm.add(Automata.makeBinary(current));
  }
  final CompiledAutomaton union = new CompiledAutomaton(Operations.union(perTerm));
  return union.runAutomaton;
}
 
示例17
/**
 * Construct an enumerator based upon an automaton, enumerating the specified
 * field, working on a supplied TermsEnum
 *
 * @lucene.experimental
 * @param tenum TermsEnum handed to the superclass constructor
 * @param compiled CompiledAutomaton; must be of type {@code NORMAL}
 * @throws IllegalArgumentException if {@code compiled} is not of type {@code NORMAL}
 */
public AutomatonTermsEnum(TermsEnum tenum, CompiledAutomaton compiled) {
  super(tenum);
  if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
  }
  this.finite = compiled.finite;
  this.runAutomaton = compiled.runAutomaton;
  assert this.runAutomaton != null;
  this.automaton = compiled.automaton;
  this.commonSuffixRef = compiled.commonSuffixRef;

  if (finite) {
    // No need to track visited states for a finite language without loops.
    visited = null;
  } else {
    visited = new short[runAutomaton.getSize()];
  }
}
 
示例18
/**
 * Returns a TermsEnum that iterates over all terms and documents that are accepted by the
 * provided {@link CompiledAutomaton}. If the <code>startTerm</code> is provided then the
 * returned enum will only return terms {@code > startTerm}, but you still must call next()
 * first to get to the first term. Note that the provided <code>startTerm</code> must be
 * accepted by the automaton.
 *
 * <p>This is an expert low-level API and will only work for {@code NORMAL} compiled automata.
 * To handle any compiled automata you should use {@link CompiledAutomaton#getTermsEnum}
 * instead.
 *
 * <p><b>NOTE</b>: the returned TermsEnum cannot seek.
 *
 * @param compiled compiled automaton; must be of type {@code NORMAL}
 * @param startTerm optional term after which enumeration starts, or {@code null}
 * @return an {@code AutomatonTermsEnum} over this instance's {@code iterator()}
 * @throws IllegalArgumentException if {@code compiled} is not of type {@code NORMAL}
 * @throws IOException if obtaining the underlying iterator fails
 */
public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) throws IOException {

  // TODO: could we factor out a common interface b/w
  // CompiledAutomaton and FST?  Then we could pass FST there too,
  // and likely speed up resolving terms to deleted docs ... but
  // AutomatonTermsEnum makes this tricky because of its on-the-fly cycle
  // detection

  // TODO: eventually we could support seekCeil/Exact on
  // the returned enum, instead of only being able to seek
  // at the start

  // Validate the argument before allocating the iterator, so an invalid call fails fast
  // without creating a TermsEnum that is immediately discarded.
  if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
  }

  TermsEnum termsEnum = iterator();

  if (startTerm == null) {
    return new AutomatonTermsEnum(termsEnum, compiled);
  } else {
    return new AutomatonTermsEnum(termsEnum, compiled) {
      @Override
      protected BytesRef nextSeekTerm(BytesRef term) throws IOException {
        // The first request (term == null) is answered relative to startTerm.
        if (term == null) {
          term = startTerm;
        }
        return super.nextSeekTerm(term);
      }
    };
  }
}
 
示例19
/**
 * Feeds the bytes of {@code b} (as unsigned values) through the run automaton of {@code c},
 * starting from state 0, and reports whether the final state is accepting.
 *
 * <p>Before every step it asserts that the current state is not -1 (presumably the automaton's
 * "no transition" marker).
 */
private boolean accepts(CompiledAutomaton c, BytesRef b) {
  final int end = b.offset + b.length;
  int state = 0;
  for (int pos = b.offset; pos < end; pos++) {
    assertTrue(state != -1);
    final int label = b.bytes[pos] & 0xff;
    state = c.runAutomaton.step(state, label);
  }
  return c.runAutomaton.isAccept(state);
}
 
示例20
/**
 * Indexes a single document and verifies that {@code Terms.intersect} rejects the automaton
 * compiled from the regexp {@code do_not_match_anything} with an
 * {@link IllegalArgumentException} carrying the "please use CompiledAutomaton.getTermsEnum
 * instead" message.
 *
 * <p>The directory, writer and reader are managed with try-with-resources so they are closed
 * (reader, then writer, then directory) even when an assertion fails.
 */
public void testIntersectRegexp() throws Exception {
  try (Directory d = newDirectory();
       RandomIndexWriter w = new RandomIndexWriter(random(), d)) {
    Document doc = new Document();
    doc.add(newStringField("field", "foobar", Field.Store.NO));
    w.addDocument(doc);
    try (IndexReader r = w.getReader()) {
      Terms terms = MultiTerms.getTerms(r, "field");
      CompiledAutomaton automaton = new CompiledAutomaton(new RegExp("do_not_match_anything").toAutomaton());
      String message = expectThrows(IllegalArgumentException.class, () -> {terms.intersect(automaton, null);}).getMessage();
      assertEquals("please use CompiledAutomaton.getTermsEnum instead", message);
    }
  }
}
 
示例21
/**
 * Compares two {@link Terms}: statistics, full iteration and seeking. When {@code deep} is set,
 * it additionally compares their intersections with randomly generated regular expressions.
 *
 * <p>If either side is {@code null}, both must be {@code null}.
 */
public void assertTerms(Terms leftTerms, Terms rightTerms, boolean deep) throws Exception {
  if (leftTerms == null || rightTerms == null) {
    assertNull(leftTerms);
    assertNull(rightTerms);
    return;
  }
  assertTermsStatistics(leftTerms, rightTerms);

  // NOTE: we don't assert hasOffsets/hasPositions/hasPayloads because they are allowed to be different

  final boolean bothHaveFreqs = leftTerms.hasFreqs() && rightTerms.hasFreqs();
  final boolean bothHavePositions = leftTerms.hasPositions() && rightTerms.hasPositions();
  TermsEnum leftEnum = leftTerms.iterator();
  TermsEnum rightEnum = rightTerms.iterator();
  assertTermsEnum(leftEnum, rightEnum, true, bothHaveFreqs, bothHavePositions);

  assertTermsSeeking(leftTerms, rightTerms);

  if (!deep) {
    return;
  }

  final int rounds = atLeast(3);
  for (int round = 0; round < rounds; round++) {
    String regexp = AutomatonTestUtil.randomRegexp(random());
    CompiledAutomaton compiled = new CompiledAutomaton(new RegExp(regexp, RegExp.NONE).toAutomaton());
    if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
      // Only NORMAL automata are compared through intersect().
      continue;
    }
    // TODO: test start term too
    TermsEnum leftIntersection = leftTerms.intersect(compiled, null);
    TermsEnum rightIntersection = rightTerms.intersect(compiled, null);
    assertTermsEnum(leftIntersection, rightIntersection, rarely(), bothHaveFreqs, bothHavePositions);
  }
}
 
示例22
/**
 * Creates an {@code IntersectEnum} from the compiled automaton and {@code startTerm}.
 *
 * @throws IllegalArgumentException if the automaton type is not {@code NORMAL}
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  if (compiled.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    return new IntersectEnum(compiled, startTerm);
  }
  throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
 
示例23
/**
 * Delegates to the wrapped {@code in}, asserts that the delegate returned a non-null enum and
 * that {@code bytes}, when given, is valid, then wraps the result in an
 * {@code AssertingTermsEnum}.
 */
@Override
public TermsEnum intersect(CompiledAutomaton automaton, BytesRef bytes) throws IOException {
    final TermsEnum wrapped = in.intersect(automaton, bytes);
    assert wrapped != null;
    assert bytes == null || bytes.isValid();
    final boolean freqsAvailable = hasFreqs();
    return new AssertingTermsEnum(wrapped, freqsAvailable);
}
 
示例24
private AutomatonBackedOrdinalsFilter(Automaton automaton) {
    this.compiled = new CompiledAutomaton(automaton);
}
 
示例25
/**
 * Combines the full iterator of {@code in} with the intersection computed on
 * {@code filterTerms} into a {@code TermVectorFilteredTermsEnum}.
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  final TermsEnum allTerms = in.iterator();
  final TermsEnum acceptedTerms = filterTerms.intersect(compiled, startTerm);
  return new TermVectorFilteredTermsEnum(allTerms, acceptedTerms);
}
 
示例26
/**
 * Creates an intersecting enum over the given field reader: clones the parent's input,
 * pre-allocates the frame stack and FST arcs, loads the root block into the first frame and,
 * when a start term is given, seeks to it.
 *
 * @param fr field reader supplying the index FST, the root block pointer and the root code
 * @param compiled compiled automaton; kept as a whole and also for its {@code runAutomaton}
 * @param startTerm term passed to {@code seekToStartTerm}; {@code null} skips the seek
 * @throws IOException declared by this constructor
 */
public OrdsIntersectTermsEnum(OrdsFieldReader fr, CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  // if (DEBUG) {
  //   System.out.println("\nintEnum.init seg=" + segment + " commonSuffix=" + brToString(compiled.commonSuffixRef));
  // }
  this.fr = fr;
  runAutomaton = compiled.runAutomaton;
  compiledAutomaton = compiled;
  // Work on a private clone of the parent's input.
  in = fr.parent.in.clone();
  // Start with five pre-built frames, each constructed with its own stack index.
  stack = new OrdsIntersectTermsEnumFrame[5];
  for(int idx=0;idx<stack.length;idx++) {
    stack[idx] = new OrdsIntersectTermsEnumFrame(this, idx);
  }
  // Fill every slot of the arcs array with a fresh Arc instance.
  for(int arcIdx=0;arcIdx<arcs.length;arcIdx++) {
    arcs[arcIdx] = new FST.Arc<>();
  }

  if (fr.index == null) {
    fstReader = null;
  } else {
    fstReader = fr.index.getBytesReader();
  }

  // TODO: if the automaton is "smallish" we really
  // should use the terms index to seek at least to
  // the initial term and likely to subsequent terms
  // (or, maybe just fallback to ATE for such cases).
  // Else the seek cost of loading the frames will be
  // too costly.

  // NOTE(review): fr.index is dereferenced unconditionally here although the null case is
  // handled just above; a null index would throw NullPointerException at this line.
  final FST.Arc<Output> arc = fr.index.getFirstArc(arcs[0]);
  // Empty string prefix must have an output in the index!
  assert arc.isFinal();

  // Special pushFrame since it's the first one:
  final OrdsIntersectTermsEnumFrame f = stack[0];
  f.fp = f.fpOrig = fr.rootBlockFP;
  f.prefix = 0;
  f.setState(0);
  f.arc = arc;
  f.outputPrefix = arc.output();
  f.load(fr.rootCode);

  // for assert:
  // (the call sits inside an assert, so it only runs when assertions are enabled)
  assert setSavedStartTerm(startTerm);

  currentFrame = f;
  if (startTerm != null) {
    seekToStartTerm(startTerm);
  }
}
 
示例27
/** Forwards the call unchanged to {@code delegateTerms}. */
@Override
public TermsEnum intersect(CompiledAutomaton compiled,
    final BytesRef startTerm) throws IOException {
  final TermsEnum delegated = delegateTerms.intersect(compiled, startTerm);
  return delegated;
}
 
示例28
/**
 * Allocates the {@code visited} array only when the compiled automaton is not finite; its
 * length is the size of the {@code runAutomaton} in scope.
 *
 * @param compiled compiled automaton; only its {@code finite} flag is read here
 */
protected AutomatonNextTermCalculator(CompiledAutomaton compiled) {
  if (compiled.finite) {
    visited = null;
  } else {
    visited = new short[runAutomaton.getSize()];
  }
}
 
示例29
/**
 * Validates the automaton type via {@code checkIntersectAutomatonType}, then creates an
 * {@code IntersectBlockReader} from the automaton, the start term and this instance's
 * dictionary, block input, postings reader, field metadata and block decoder.
 */
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  checkIntersectAutomatonType(compiled);
  final IntersectBlockReader reader = new IntersectBlockReader(
      compiled, startTerm, dictionaryBrowserSupplier, blockInput, postingsReader, fieldMetadata, blockDecoder);
  return reader;
}
 
示例30
/**
 * Ensures the automaton is of type {@code NORMAL}.
 *
 * @throws IllegalArgumentException if the automaton has any other type
 */
protected void checkIntersectAutomatonType(CompiledAutomaton automaton) {
  // This check is consistent with other impls and precondition stated in javadoc.
  if (automaton.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    return;
  }
  throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}