Java Source Code Examples: org.apache.lucene.analysis.custom.CustomAnalyzer
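CustomAnalyzer is Lucene's configurable analyzer: its builder assembles an analysis chain from char filter, tokenizer, and token filter factories, referenced either by factory class or by SPI name, and the result is used like any other Analyzer. As a minimal sketch of the pattern the examples share (written for illustration, not taken from any example below; "standard" and "lowercase" are the SPI names of StandardTokenizerFactory and LowerCaseFilterFactory in lucene-analysis-common):

Analyzer analyzer = CustomAnalyzer.builder()
        .withTokenizer("standard")   // tokenizer, looked up by SPI name
        .addTokenFilter("lowercase") // token filters run in the order they are added
        .build();                    // build() declares IOException

Char filters run first, then the tokenizer, then the token filters in the order they are added. The examples below show the same builder driven by factory classes, SPI names, and externally loaded configuration.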
Example 1
/**
 * Create a new empty search index.
 */
public SongSearchIndex() {
    songs = new HashMap<>();
    try {
        analyzer = CustomAnalyzer.builder()
                .withTokenizer(StandardTokenizerFactory.class)
                .addTokenFilter(LowerCaseFilterFactory.class)
                .addTokenFilter(ASCIIFoldingFilterFactory.class)
                .build();
        index = new MMapDirectory(Files.createTempDirectory("quelea-mmap-song").toAbsolutePath());
    }
    catch (IOException ex) {
        LOGGER.log(Level.SEVERE, "Couldn't create song search index");
        throw new RuntimeException("Couldn't create song search index", ex);
    }
}
Example 2
@Test
public void testAnalyze_custom() {
    AnalysisImpl analysis = new AnalysisImpl();
    Map<String, String> tkParams = new HashMap<>();
    tkParams.put("maxTokenLen", "128");
    CustomAnalyzerConfig.Builder builder = new CustomAnalyzerConfig.Builder("keyword", tkParams)
            .addTokenFilterConfig("lowercase", Collections.emptyMap());
    CustomAnalyzer analyzer = (CustomAnalyzer) analysis.buildCustomAnalyzer(builder.build());

    assertEquals("org.apache.lucene.analysis.custom.CustomAnalyzer", analyzer.getClass().getName());
    assertEquals("org.apache.lucene.analysis.core.KeywordTokenizerFactory", analyzer.getTokenizerFactory().getClass().getName());
    assertEquals("org.apache.lucene.analysis.core.LowerCaseFilterFactory", analyzer.getTokenFilterFactories().get(0).getClass().getName());

    String text = "Apache Lucene";
    List<Analysis.Token> tokens = analysis.analyze(text);
    assertNotNull(tokens);
}
Example 3
public void testPOS() throws Exception {
    CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
            .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
            .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
            .build();
    assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
            SENTENCES_posTags, null, null, true);

    analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
            .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
            .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
            .addTokenFilter(TypeAsPayloadTokenFilterFactory.class)
            .build();
    assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
            null, null, null, true, toPayloads(SENTENCES_posTags));
}
Example 4
@Override
public void setArgs(IndexSchema schema, Map<String, String> args) {
    args.putIfAbsent("stored", "false");
    args.putIfAbsent("omitTermFreqAndPositions", "true");
    args.putIfAbsent("omitNorms", "true");
    args.putIfAbsent("maxCharsForDocValues", "-1");
    super.setArgs(schema, args);

    // CustomAnalyzer is easy to use
    CustomAnalyzer customAnalyzer;
    try {
        customAnalyzer = CustomAnalyzer.builder(schema.getResourceLoader())
                .withDefaultMatchVersion(schema.getDefaultLuceneMatchVersion())
                .withTokenizer(KeywordTokenizerFactory.class)
                .addTokenFilter(PatternReplaceFilterFactory.class,
                        "pattern", "#\\d*",
                        "replace", "all")
                .build();
    } catch (IOException e) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); // impossible?
    }
    // Solr HTTP Schema APIs don't know about CustomAnalyzer so use TokenizerChain instead
    setIndexAnalyzer(new TokenizerChain(customAnalyzer));
    // leave queryAnalyzer as literal
}
Example 5
public static void main(String[] args) throws IOException {
    Analyzer az = CustomAnalyzer.builder()
            //.withTokenizer("Standard")
            .withTokenizer("Name")
            .addTokenFilter("EdgeNGram", "minGramSize", "1", "maxGramSize", "20")
            //.addTokenFilter("ICUTransform", "id", "Han-Latin;NFD;[[:NonspacingMark:][:Space:]] Remove")
            //.addTokenFilter("EdgeNGram", "minGramSize", "1", "maxGramSize", "20")
            .build();
    StringReader sr = new StringReader(args[0]);
    TokenStream ts = az.tokenStream("", sr);
    OffsetAttribute oa = ts.addAttribute(OffsetAttribute.class);
    CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
    try {
        ts.reset(); // Resets this stream to the beginning. (Required)
        while (ts.incrementToken()) {
            System.out.println(ta.toString() + "|" + ta.length()
                    + "[" + oa.startOffset() + "," + oa.endOffset() + "]");
        }
        ts.end(); // Perform end-of-stream operations, e.g. set the final offset.
    } finally {
        ts.close(); // Release resources associated with this stream.
    }
}
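Usage note: CustomAnalyzer builds a single analysis chain that is applied regardless of field, so the empty field name passed to tokenStream() above does not change the output. The reset()/incrementToken()/end()/close() sequence is the standard TokenStream consumption contract, as the inline comments indicate.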
Example 6
private Builder createDefaultAnalyzerBuilder() throws IOException {
    Builder builder = CustomAnalyzer.builder()
            .withTokenizer(StandardTokenizerFactory.class)
            .addTokenFilter(CJKWidthFilterFactory.class)
            .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "false")
            .addTokenFilter(LowerCaseFilterFactory.class)
            .addTokenFilter(StopFilterFactory.class, "words", STOP_WORDS)
            .addTokenFilter(EnglishPossessiveFilterFactory.class);
    addTokenFilterForUnderscoreRemovalAroundToken(builder);
    return builder;
}
Example 7
private Builder createArtistAnalyzerBuilder() throws IOException {
    Builder builder = CustomAnalyzer.builder()
            .withTokenizer(StandardTokenizerFactory.class)
            .addTokenFilter(CJKWidthFilterFactory.class)
            .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "false")
            .addTokenFilter(LowerCaseFilterFactory.class)
            .addTokenFilter(StopFilterFactory.class, "words", STOP_WORDS_ARTIST)
            .addTokenFilter(EnglishPossessiveFilterFactory.class);
    addTokenFilterForUnderscoreRemovalAroundToken(builder);
    return builder;
}
Example 8
/**
 * Create a new empty search index.
 */
public BibleSearchIndex() {
    chapters = new HashMap<>();
    try {
        analyzer = CustomAnalyzer.builder()
                .withTokenizer(StandardTokenizerFactory.class)
                .addTokenFilter(LowerCaseFilterFactory.class)
                .addTokenFilter(ASCIIFoldingFilterFactory.class)
                .build();
        index = new MMapDirectory(Files.createTempDirectory("quelea-mmap-bible").toAbsolutePath());
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, "Couldn't create bible search index");
        throw new RuntimeException("Couldn't create bible search index", ex);
    }
}
Example 9
@Override
public void setAnalyzer(Analyzer analyzer) {
    analyzerNameLbl.setText(analyzer.getClass().getName());

    if (analyzer instanceof CustomAnalyzer) {
        CustomAnalyzer customAnalyzer = (CustomAnalyzer) analyzer;

        DefaultListModel<String> charFilterListModel = new DefaultListModel<>();
        customAnalyzer.getCharFilterFactories().stream()
                .map(f -> f.getClass().getSimpleName())
                .forEach(charFilterListModel::addElement);
        charFilterList.setModel(charFilterListModel);

        tokenizerTF.setText(customAnalyzer.getTokenizerFactory().getClass().getSimpleName());

        DefaultListModel<String> tokenFilterListModel = new DefaultListModel<>();
        customAnalyzer.getTokenFilterFactories().stream()
                .map(f -> f.getClass().getSimpleName())
                .forEach(tokenFilterListModel::addElement);
        tokenFilterList.setModel(tokenFilterListModel);

        charFilterList.setBackground(Color.white);
        tokenizerTF.setBackground(Color.white);
        tokenFilterList.setBackground(Color.white);
    } else {
        charFilterList.setModel(new DefaultListModel<>());
        tokenizerTF.setText("");
        tokenFilterList.setModel(new DefaultListModel<>());
        charFilterList.setBackground(Color.lightGray);
        tokenizerTF.setBackground(Color.lightGray);
        tokenFilterList.setBackground(Color.lightGray);
    }
}
Example 10
void showAnalysisChainDialog() {
    if (getCurrentAnalyzer() instanceof CustomAnalyzer) {
        CustomAnalyzer analyzer = (CustomAnalyzer) getCurrentAnalyzer();
        new DialogOpener<>(analysisChainDialogFactory).open("Analysis chain", 600, 320,
                (factory) -> {
                    factory.setAnalyzer(analyzer);
                });
    }
}
Example 11
@Override
public Analyzer buildCustomAnalyzer(CustomAnalyzerConfig config) {
    Objects.requireNonNull(config);
    try {
        // create builder
        CustomAnalyzer.Builder builder = config.getConfigDir()
                .map(path -> CustomAnalyzer.builder(FileSystems.getDefault().getPath(path)))
                .orElse(CustomAnalyzer.builder());

        // set tokenizer
        builder.withTokenizer(config.getTokenizerConfig().getName(), config.getTokenizerConfig().getParams());

        // add char filters
        for (CustomAnalyzerConfig.ComponentConfig cfConf : config.getCharFilterConfigs()) {
            builder.addCharFilter(cfConf.getName(), cfConf.getParams());
        }
        // add token filters
        for (CustomAnalyzerConfig.ComponentConfig tfConf : config.getTokenFilterConfigs()) {
            builder.addTokenFilter(tfConf.getName(), tfConf.getParams());
        }

        // build analyzer
        this.analyzer = builder.build();
        return analyzer;
    } catch (Exception e) {
        throw new LukeException("Failed to build custom analyzer.", e);
    }
}
Example 12
@Test
public void testAnalyzer_custom_with_confdir() throws Exception {
    Path confDir = createTempDir("conf");
    Path stopFile = Files.createFile(Paths.get(confDir.toString(), "stop.txt"));
    Files.write(stopFile, "of\nthe\nby\nfor\n".getBytes(StandardCharsets.UTF_8));

    AnalysisImpl analysis = new AnalysisImpl();

    Map<String, String> tkParams = new HashMap<>();
    tkParams.put("maxTokenLen", "128");
    Map<String, String> tfParams = new HashMap<>();
    tfParams.put("ignoreCase", "true");
    tfParams.put("words", "stop.txt");
    tfParams.put("format", "wordset");
    CustomAnalyzerConfig.Builder builder = new CustomAnalyzerConfig.Builder("whitespace", tkParams)
            .configDir(confDir.toString())
            .addTokenFilterConfig("lowercase", Collections.emptyMap())
            .addTokenFilterConfig("stop", tfParams);
    CustomAnalyzer analyzer = (CustomAnalyzer) analysis.buildCustomAnalyzer(builder.build());

    assertEquals("org.apache.lucene.analysis.custom.CustomAnalyzer", analyzer.getClass().getName());
    assertEquals("org.apache.lucene.analysis.core.WhitespaceTokenizerFactory", analyzer.getTokenizerFactory().getClass().getName());
    assertEquals("org.apache.lucene.analysis.core.LowerCaseFilterFactory", analyzer.getTokenFilterFactories().get(0).getClass().getName());
    assertEquals("org.apache.lucene.analysis.core.StopFilterFactory", analyzer.getTokenFilterFactories().get(1).getClass().getName());

    String text = "Government of the People, by the People, for the People";
    List<Analysis.Token> tokens = analysis.analyze(text);
    assertNotNull(tokens);
}
Example 13
@Test
public void testAnalyzeStepByStep_custom() {
    AnalysisImpl analysis = new AnalysisImpl();
    Map<String, String> tkParams = new HashMap<>();
    tkParams.put("maxTokenLen", "128");
    CustomAnalyzerConfig.Builder builder = new CustomAnalyzerConfig.Builder("keyword", tkParams)
            .addTokenFilterConfig("lowercase", Collections.emptyMap())
            .addCharFilterConfig("htmlstrip", Collections.emptyMap());
    CustomAnalyzer analyzer = (CustomAnalyzer) analysis.buildCustomAnalyzer(builder.build());

    assertEquals("org.apache.lucene.analysis.custom.CustomAnalyzer", analyzer.getClass().getName());
    assertEquals("org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory",
            analyzer.getCharFilterFactories().get(0).getClass().getName());
    assertEquals("org.apache.lucene.analysis.core.KeywordTokenizerFactory",
            analyzer.getTokenizerFactory().getClass().getName());
    assertEquals("org.apache.lucene.analysis.core.LowerCaseFilterFactory",
            analyzer.getTokenFilterFactories().get(0).getClass().getName());

    String text = "Apache Lucene";
    Analysis.StepByStepResult result = analysis.analyzeStepByStep(text);
    assertNotNull(result);
    assertNotNull(result.getCharfilteredTexts());
    assertEquals(1, result.getCharfilteredTexts().size());
    assertEquals("htmlStrip", result.getCharfilteredTexts().get(0).getName());
    assertNotNull(result.getNamedTokens());
    assertEquals(2, result.getNamedTokens().size());
    // FIXME check each namedTokensList
    assertEquals("keyword", result.getNamedTokens().get(0).getName());
    assertEquals("lowercase", result.getNamedTokens().get(1).getName());
}
Example 14
public void testBasic() throws Exception {
    CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
            .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
            .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
            .addTokenFilter("opennlpChunker", "chunkerModel", chunkerModelFile)
            .build();
    assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
            SENTENCES_chunks, null, null, true);
}
Example 15
public void testPayloads() throws Exception {
    CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
            .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
            .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
            .addTokenFilter("opennlpChunker", "chunkerModel", chunkerModelFile)
            .addTokenFilter(TypeAsPayloadTokenFilterFactory.class)
            .build();
    assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets,
            null, null, null, true, toPayloads(SENTENCES_chunks));
}
Example 16
public void testBasic() throws IOException {
    CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
            .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
            .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
            .build();
    assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets);
}
Example 17
public void testNoBreak() throws Exception {
    CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
            .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
            .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
            .build();
    assertAnalyzesTo(analyzer, NO_BREAK, NO_BREAK_terms, NO_BREAK_startOffsets, NO_BREAK_endOffsets,
            null, null, null, true);
}
Example 18
@Test
public void testTokenizer() throws IOException {
    CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
            .withTokenizer("opennlp", "sentenceModel", "en-test-sent.bin", "tokenizerModel", "en-test-tokenizer.bin")
            .build();
    assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_punc, SENTENCES_startOffsets, SENTENCES_endOffsets);
    assertAnalyzesTo(analyzer, SENTENCE1, SENTENCE1_punc);
}
Example 19
@Test
public void testTokenizerNoSentenceDetector() throws IOException {
    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
        CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
                .withTokenizer("opennlp", "tokenizerModel", "en-test-tokenizer.bin")
                .build();
    });
    assertTrue(expected.getMessage().contains("Configuration Error: missing parameter 'sentenceModel'"));
}
Example 20
@Test
public void testTokenizerNoTokenizer() throws IOException {
    IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
        CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
                .withTokenizer("opennlp", "sentenceModel", "en-test-sent.bin")
                .build();
    });
    assertTrue(expected.getMessage().contains("Configuration Error: missing parameter 'tokenizerModel'"));
}
Example 21
public void test1SentenceDictionaryOnly() throws Exception {
    CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
            .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
            .addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
            .addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict")
            .build();
    assertAnalyzesTo(analyzer, SENTENCE, SENTENCE_dict_punc, null, null,
            SENTENCE_posTags, null, null, true);
}
Example 22
public void test2SentencesDictionaryOnly() throws Exception {
    CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
            .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
            .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
            .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile)
            .build();
    assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_dict_punc, null, null,
            SENTENCES_posTags, null, null, true);
}
Example 23
public void test1SentenceMaxEntOnly() throws Exception {
    CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
            .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
            .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
            .addTokenFilter("opennlplemmatizer", "lemmatizerModel", lemmatizerModelFile)
            .build();
    assertAnalyzesTo(analyzer, SENTENCE, SENTENCE_maxent_punc, null, null,
            SENTENCE_posTags, null, null, true);
}
Example 24
public void test2SentencesMaxEntOnly() throws Exception {
    CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
            .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
            .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
            .addTokenFilter("OpenNLPLemmatizer", "lemmatizerModel", lemmatizerModelFile)
            .build();
    assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_maxent_punc, null, null,
            SENTENCES_posTags, null, null, true);
}
Example 25
public void test1SentenceDictionaryAndMaxEnt() throws Exception {
    CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
            .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
            .addTokenFilter("opennlpPOS", "posTaggerModel", "en-test-pos-maxent.bin")
            .addTokenFilter("opennlplemmatizer", "dictionary", "en-test-lemmas.dict", "lemmatizerModel", lemmatizerModelFile)
            .build();
    assertAnalyzesTo(analyzer, SENTENCE_both, SENTENCE_both_punc, null, null,
            SENTENCE_both_posTags, null, null, true);
}
Example 26
public void test2SentencesDictionaryAndMaxEnt() throws Exception {
    CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
            .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
            .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
            .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile, "lemmatizerModel", lemmatizerModelFile)
            .build();
    assertAnalyzesTo(analyzer, SENTENCES_both, SENTENCES_both_punc, null, null,
            SENTENCES_both_posTags, null, null, true);
}
Example 27
public void testKeywordAttributeAwarenessDictionaryOnly() throws Exception {
    CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
            .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
            .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
            .addTokenFilter(KeywordRepeatFilterFactory.class)
            .addTokenFilter("opennlplemmatizer", "dictionary", lemmatizerDictFile)
            .addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
            .build();
    assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_dict_keep_orig_punc, null, null,
            SENTENCES_keep_orig_posTags, null, null, true);
}
Example 28
public void testKeywordAttributeAwarenessMaxEntOnly() throws Exception {
    CustomAnalyzer analyzer = CustomAnalyzer.builder(new ClasspathResourceLoader(getClass()))
            .withTokenizer("opennlp", "tokenizerModel", tokenizerModelFile, "sentenceModel", sentenceModelFile)
            .addTokenFilter("opennlpPOS", "posTaggerModel", posTaggerModelFile)
            .addTokenFilter(KeywordRepeatFilterFactory.class)
            .addTokenFilter("opennlplemmatizer", "lemmatizerModel", lemmatizerModelFile)
            .addTokenFilter(RemoveDuplicatesTokenFilterFactory.class)
            .build();
    assertAnalyzesTo(analyzer, SENTENCES, SENTENCES_max_ent_keep_orig_punc, null, null,
            SENTENCES_keep_orig_posTags, null, null, true);
}