Java源码示例:edu.stanford.nlp.process.Tokenizer
示例1
/**
 * Sets up the annotation pipeline used to detect date columns.
 *
 * <p>Annotators are added to {@code pipeline} in this order: a tokenizer
 * annotator backed by a {@code PTBTokenizer}, a words-to-sentences
 * annotator, a POS tagger annotator, and a time annotator registered under
 * the name {@code "sutime"}.
 */
public void initialize() {
    // Tokenizer annotator whose tokenizer is always a PTBTokenizer producing
    // CoreLabel tokens, created with an empty options string.
    TokenizerAnnotator ptbTokenizing = new TokenizerAnnotator(false) {
        @Override
        public Tokenizer<CoreLabel> getTokenizer(Reader r) {
            return new PTBTokenizer<CoreLabel>(r, new CoreLabelTokenFactory(), "");
        }
    };
    pipeline.addAnnotator(ptbTokenizing);
    pipeline.addAnnotator(new WordsToSentencesAnnotator(false));
    pipeline.addAnnotator(new POSTaggerAnnotator(false));

    // The time annotator is handed an empty property set.
    Properties timeProps = new Properties();
    pipeline.addAnnotator(new TimeAnnotator("sutime", timeProps));
}
示例2
/**
 * Parses a sentence and reports the parser's PCFG score for it, which
 * callers can use as a confidence measure.
 *
 * @param sentence
 *            the sentence to tokenize and parse
 * @return the PCFG score reported by the parser after parsing the sentence
 * @throws RuntimeException
 *             if the language pack or the parser has not been initialized
 */
@SuppressWarnings("unchecked")
public static double getPCFGScore(String sentence)
{
    final boolean initialized = tlp != null && parser != null;
    if (!initialized)
    {
        throw new RuntimeException("Parser has not been initialized");
    }

    log.debug("Parsing sentence");
    // Tokenizing, parsing and reading the score all happen while holding the
    // parser's monitor (presumably so the score read back belongs to this
    // parse and not to a concurrent one).
    synchronized (parser)
    {
        // Raw Tokenizer on purpose: the token list is consumed as List<Word>.
        Tokenizer tokenizer = tlp.getTokenizerFactory()
                .getTokenizer(new StringReader(sentence));
        List<Word> words = tokenizer.tokenize();
        log.debug("Tokenization: " + words);
        parser.parse(new Sentence(words));
        return parser.getPCFGScore();
    }
}
示例3
/**
 * Demo entry point: parses the same sentence twice with a lexicalized
 * parser, first from a pre-tokenized word array and then from raw text run
 * through a PTB tokenizer, printing the parse tree and typed dependencies.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String args[]) {
    final String modelPath = getResourcePath() + "englishPCFG.ser.gz";
    final LexicalizedParser parser = LexicalizedParser.loadModel(modelPath);

    // Variant 1: the words are already split, so they go straight to the parser.
    final String[] preTokenized = {"The", "cow", "jumped", "over", "the", "moon", "."};
    Tree tree = parser.apply(SentenceUtils.toCoreLabelList(preTokenized));
    tree.pennPrint();
    new TreePrint("typedDependenciesCollapsed").printTree(tree);

    // Variant 2: start from raw text and tokenize it explicitly.
    final String rawText = "The cow jumped over the moon.";
    TokenizerFactory<CoreLabel> ptbFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    Tokenizer<CoreLabel> ptbTokenizer = ptbFactory.getTokenizer(new StringReader(rawText));
    List<CoreLabel> tokens = ptbTokenizer.tokenize();
    tree = parser.apply(tokens);

    // Derive the typed dependencies from the second parse.
    TreebankLanguagePack languagePack = parser.treebankLanguagePack();
    GrammaticalStructureFactory structureFactory = languagePack.grammaticalStructureFactory();
    GrammaticalStructure structure = structureFactory.newGrammaticalStructure(tree);
    List<TypedDependency> dependencies = structure.typedDependenciesCCprocessed();
    System.out.println(dependencies);

    for (TypedDependency dependency : dependencies) {
        StringBuilder line = new StringBuilder();
        line.append("Governor Word: [").append(dependency.gov());
        line.append("] Relation: [").append(dependency.reln().getLongName());
        line.append("] Dependent Word: [").append(dependency.dep()).append("]");
        System.out.println(line.toString());
    }
}
示例4
/**
 * Tokenizes the input and returns the token texts as a sequence.
 *
 * <p>The input is trimmed and passed through {@code toUncased} before
 * tokenization. When a compound splitter is configured, the resulting
 * sequence is run through it before being returned.
 *
 * @param input the text to tokenize
 * @return the sequence of token texts, after compound splitting if enabled
 */
@Override
public Sequence<IString> process(String input) {
    final StringReader reader = new StringReader(toUncased(input.trim()));
    final List<String> tokenTexts = new ArrayList<>();

    // Drain the tokenizer, keeping only each token's text annotation.
    for (Tokenizer<CoreLabel> tokens = tf.getTokenizer(reader); tokens.hasNext(); ) {
        tokenTexts.add(tokens.next().get(TextAnnotation.class));
    }

    final Sequence<IString> sequence = IStrings.toIStringSequence(tokenTexts);
    if (compoundSplitter == null) {
        return sequence;
    }
    return compoundSplitter.process(sequence);
}
示例5
/**
 * Parses a sentence and returns a string representation of the best parse
 * tree.
 *
 * @param sentence
 *            the sentence to tokenize and parse
 * @return the best parse tree rendered with {@code toString()}, with every
 *         annotation of the form {@code " [...]"} (a space followed by
 *         non-whitespace text in square brackets) removed
 * @throws RuntimeException
 *             if the language pack or the parser has not been initialized
 */
@SuppressWarnings("unchecked")
public static String parse(String sentence)
{
    final boolean initialized = tlp != null && parser != null;
    if (!initialized)
    {
        throw new RuntimeException("Parser has not been initialized");
    }

    log.debug("Parsing sentence");
    final Tree bestParse;
    // Tokenizing, parsing and fetching the best parse all happen while
    // holding the parser's monitor.
    synchronized (parser)
    {
        // Raw Tokenizer on purpose: the token list is consumed as List<Word>.
        Tokenizer tokenizer = tlp.getTokenizerFactory()
                .getTokenizer(new StringReader(sentence));
        List<Word> words = tokenizer.tokenize();
        log.debug("Tokenization: " + words);
        parser.parse(new Sentence(words));
        bestParse = parser.getBestParse();
    }

    // NOTE: labeling the tree with character offsets is currently disabled;
    // the tree is returned as plain text only.
    String rendered = bestParse.toString();
    return rendered.replaceAll(" \\[[\\S]+\\]", "");
}
示例6
/**
 * Demonstrates the Stanford lexicalized parser in two ways: parsing a list
 * of already-tokenized words, and parsing raw text through an explicit PTB
 * tokenizer followed by printing its typed dependencies.
 */
private static void usingStanfordLexicalizedParser() {
    final String modelLocation = "C:/Current Books in Progress/NLP and Java/Models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    final LexicalizedParser parser = LexicalizedParser.loadModel(modelLocation);

    // Option 1: hand the parser words that are already correctly tokenized.
    System.out.println("---First option");
    final String[] sentenceWords = {"The", "cow", "jumped", "over", "the", "moon", "."};
    Tree tree = parser.apply(Sentence.toCoreLabelList(sentenceWords));
    tree.pennPrint();
    System.out.println();

    // Option 2: load an explicit tokenizer and let it split the raw text.
    System.out.println("---Second option");
    final String rawText = "The cow jumped over the moon.";
    TokenizerFactory<CoreLabel> ptbFactory = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> tokens = ptbFactory.getTokenizer(new StringReader(rawText)).tokenize();
    tree = parser.apply(tokens);

    // The parser's language pack (PennTreebankLanguagePack for English,
    // per the original author's note) supplies the grammatical structure factory.
    GrammaticalStructure structure = parser.treebankLanguagePack()
            .grammaticalStructureFactory()
            .newGrammaticalStructure(tree);
    List<TypedDependency> dependencies = structure.typedDependenciesCCprocessed();
    System.out.println(dependencies);

    for (TypedDependency dependency : dependencies) {
        String governor = "Governor Word: [" + dependency.gov();
        String relation = "] Relation: [" + dependency.reln().getLongName();
        String dependent = "] Dependent Word: [" + dependency.dep() + "]";
        System.out.println(governor + relation + dependent);
    }
    System.out.println();

    // A TreePrint object (e.g. with the "penn,typedDependenciesCollapsed"
    // format) can also be used to print trees and dependencies; that
    // variant is not exercised here.
}