Java源码示例:edu.stanford.nlp.ie.AbstractSequenceClassifier
示例1
/**
 * Runs only the named entity classifier (Conditional Random Fields) over the input.
 *
 * @param input    the text to annotate
 * @param hash     the hash code of the input
 * @param language the language of the input
 * @return the annotations, holding one NER entry per span reported by the classifier
 * @throws InterruptedException declared by the signature; nothing in this body throws it
 *         explicitly (presumably raised while obtaining the model - confirm against
 *         CoreNlpNerModels)
 */
private Annotations processNerClassifier(String input, String hash, Language language) throws InterruptedException {
    final Annotations result = new Annotations(hash, getType(), language);
    LOGGER.info("name-finding for " + language.toString());

    // Look up the model registered for this language and classify the whole input at once.
    final CoreNlpAnnotator<AbstractSequenceClassifier<CoreLabel>> nerModel =
            CoreNlpNerModels.getInstance().get(language);
    final List<Triple<String, Integer, Integer>> spans =
            nerModel.annotator.classifyToCharacterOffsets(input);

    // Each triple is <category label, begin offset, end offset>.
    for (final Triple<String, Integer, Integer> span : spans) {
        final NamedEntity.Category entityCategory = NamedEntity.Category.parse(span.first());
        final int startOffset = span.second();
        final int endOffset = span.third();
        result.add(NER, startOffset, endOffset, entityCategory);
    }
    return result;
}
示例2
/**
 * Checks conversion of Stanford NER output format into
 * {@link com.bericotech.clavin.resolver.ClavinLocationResolver}
 * input format.
 *
 * @throws IOException if the model properties resource is missing or cannot be read
 */
@Test
public void testConvertNERtoCLAVIN() throws IOException {
    final String propertiesResource = "models/english.all.3class.distsim.prop";
    Properties mp = new Properties();
    // try-with-resources closes the classpath stream once the properties are loaded.
    try (InputStream mpis = this.getClass().getClassLoader().getResourceAsStream(propertiesResource)) {
        if (mpis == null) {
            // getResourceAsStream reports a missing resource by returning null, which would
            // otherwise surface as an uninformative NullPointerException inside load().
            throw new IOException("classpath resource not found: " + propertiesResource);
        }
        mp.load(mpis);
    }

    AbstractSequenceClassifier<CoreMap> namedEntityRecognizer =
            CRFClassifier.getJarClassifier("/models/english.all.3class.distsim.crf.ser.gz", mp);

    String text = "I was born in Springfield and grew up in Boston.";
    List<Triple<String, Integer, Integer>> entitiesFromNER = namedEntityRecognizer.classifyToCharacterOffsets(text);
    List<LocationOccurrence> locationsForCLAVIN = convertNERtoCLAVIN(entitiesFromNER, text);

    // Both place names must survive the conversion with their character offsets intact.
    assertEquals("wrong number of entities", 2, locationsForCLAVIN.size());
    assertEquals("wrong text for first entity", "Springfield", locationsForCLAVIN.get(0).getText());
    assertEquals("wrong position for first entity", 14, locationsForCLAVIN.get(0).getPosition());
    assertEquals("wrong text for second entity", "Boston", locationsForCLAVIN.get(1).getText());
    assertEquals("wrong position for second entity", 41, locationsForCLAVIN.get(1).getPosition());
}
示例3
/**
 * Builds the initial classifier by loading the CRF model at {@code classifierFilePath}.
 * Any exception raised while loading is logged and the null-object classifier is used instead.
 *
 * @return the loaded classifier, or {@code NULL_OBJECT_CLASSIFIER} if loading failed
 */
protected AbstractSequenceClassifier<CoreLabel> initialValue() {
    AbstractSequenceClassifier<CoreLabel> loaded;
    try {
        loaded = CRFClassifier.getClassifier(classifierFilePath);
    } catch (final Exception loadFailure) {
        // Deliberate best-effort: report the failure and fall back instead of propagating.
        LOGGER.error(MessageCatalog._00052_CLASSIFIER_LOAD_FAILURE, classifierFilePath);
        loaded = NULL_OBJECT_CLASSIFIER;
    }
    return loaded;
}
示例4
/**
 * Returns the classifier, loading it from {@code classifierFilePath} the first time
 * the field is found to be {@code null}. The check and the load both run while holding
 * this instance's monitor.
 *
 * @return the cached classifier, or {@code NULL_OBJECT_CLASSIFIER} if loading failed
 */
@Override
AbstractSequenceClassifier<CoreLabel> classifier() {
    synchronized (this) {
        // Already loaded (or already replaced by the fallback): reuse it.
        if (classifier != null) {
            return classifier;
        }
        try {
            classifier = CRFClassifier.getClassifier(classifierFilePath);
        } catch (final Exception loadFailure) {
            // Deliberate best-effort: report the failure and fall back instead of propagating.
            LOGGER.error(MessageCatalog._00052_CLASSIFIER_LOAD_FAILURE, classifierFilePath);
            classifier = NULL_OBJECT_CLASSIFIER;
        }
        return classifier;
    }
}
示例5
/**
 * Loads a CRF recognizer from the {@code models/} location, configured with a
 * properties file read from the classpath.
 *
 * @param NERmodel file name of the serialized model, resolved as {@code "models/" + NERmodel}
 * @param NERprop  file name of the properties resource, resolved as {@code "models/" + NERprop}
 * @return the recognizer built from the model and its properties
 * @throws IOException            if the properties resource is missing or cannot be read,
 *                                or if the model cannot be read
 * @throws ClassCastException     propagated from model loading
 * @throws ClassNotFoundException propagated from model loading
 */
private AbstractSequenceClassifier<CoreMap> recognizerForFiles(String NERmodel, String NERprop) throws IOException, ClassCastException, ClassNotFoundException {
    final String propertiesResource = "models/" + NERprop;
    Properties mp = new Properties();
    // try-with-resources closes the classpath stream once the properties are loaded.
    try (InputStream mpis = this.getClass().getClassLoader().getResourceAsStream(propertiesResource)) {
        if (mpis == null) {
            // getResourceAsStream reports a missing resource by returning null, which would
            // otherwise surface as an uninformative NullPointerException inside load().
            throw new IOException("classpath resource not found: " + propertiesResource);
        }
        mp.load(mpis);
    }
    AbstractSequenceClassifier<CoreMap> recognizer = (AbstractSequenceClassifier<CoreMap>) CRFClassifier.getClassifier("models/" + NERmodel, mp);
    return recognizer;
}
示例6
/**
 * Loads the per-language recognizers and the supporting lookup tables
 * (demonyms, custom substitutions, location blacklist, person-to-place substitutions).
 *
 * @param config the configuration object; not read by this method body
 * @throws ClassCastException     propagated from recognizer loading
 * @throws IOException            propagated from recognizer loading
 * @throws ClassNotFoundException propagated from recognizer loading
 */
public void initialize(CliffConfig config) throws ClassCastException, IOException, ClassNotFoundException {
    recognizerByLanguage = new HashMap<>();

    // Recognizers are loaded in the order German, Spanish, English.
    final AbstractSequenceClassifier<CoreMap> germanRecognizer =
            recognizerForFiles("german.conll.germeval2014.hgc_175m_600.crf.ser.gz", "german-2018.hgc_175m_600.prop");
    recognizerByLanguage.put(GERMAN, germanRecognizer);

    final AbstractSequenceClassifier<CoreMap> spanishRecognizer =
            recognizerForFiles("spanish.ancora.distsim.s512.crf.ser.gz", "spanish.ancora.distsim.s512.prop");
    recognizerByLanguage.put(SPANISH, spanishRecognizer);

    final AbstractSequenceClassifier<CoreMap> englishRecognizer =
            recognizerForFiles("english.all.3class.caseless.distsim.crf.ser.gz", "english.all.3class.caseless.distsim.prop");
    recognizerByLanguage.put(ENGLISH, englishRecognizer);

    // Supporting lookup tables used during entity extraction.
    demonyms = new WikipediaDemonymMap();
    customSubstitutions = new CustomSubstitutionMap(CUSTOM_SUBSTITUTION_FILE);
    locationBlacklist = new Blacklist(LOCATION_BLACKLIST_FILE);
    personToPlaceSubstitutions = new CustomSubstitutionMap(PERSON_TO_PLACE_FILE, false);
}
示例7
/**
 * Renders an already-classified sentence using the "inlineXML" output style.
 *
 * @param sentence        the classified sentence to render
 * @param readerAndWriter not consulted: a fresh {@link PlainTextDocumentReaderAndWriter}
 *                        is always created and used instead
 * @param classif         the classifier whose {@code flags} initialise the writer
 * @return the rendered sentence
 */
public static String classifyToString(List<CoreMap> sentence, DocumentReaderAndWriter<CoreMap> readerAndWriter, AbstractSequenceClassifier classif) {
    final PlainTextDocumentReaderAndWriter.OutputStyle inlineXmlStyle =
            PlainTextDocumentReaderAndWriter.OutputStyle.fromShortName("inlineXML");

    // Holding the writer in a variable of its concrete type makes a downcast unnecessary.
    final PlainTextDocumentReaderAndWriter<CoreMap> plainTextWriter =
            new PlainTextDocumentReaderAndWriter<CoreMap>();
    plainTextWriter.init(classif.flags);

    final StringBuilder rendered = new StringBuilder();
    rendered.append(plainTextWriter.getAnswers(sentence, inlineXmlStyle, true));
    return rendered.toString();
}
示例8
/**
 * Returns whatever {@code classifiers.get()} currently yields.
 *
 * @return the classifier obtained from {@code classifiers}
 */
@Override
AbstractSequenceClassifier<CoreLabel> classifier() {
    final AbstractSequenceClassifier<CoreLabel> current = classifiers.get();
    return current;
}
示例9
/**
 * Get extracted locations from a plain-text body.
 *
 * <p>People, locations and organizations reported by the recognizer are collected.
 * Person names listed in the person-to-place substitutions are recorded as locations,
 * blacklisted locations are dropped, and MISC entities are kept as locations only when
 * they are known demonyms.
 *
 * @param textToParse               Text content to perform extraction on.
 * @param manuallyReplaceDemonyms   Can slow down performance quite a bit
 * @param language                  What language to parse in
 * @return All the entities mentioned; empty when the text is null/empty or when no
 *         recognizer is registered for {@code language}
 */
@Override
public ExtractedEntities extractEntities(String textToParse, boolean manuallyReplaceDemonyms, String language) {
    ExtractedEntities entities = new ExtractedEntities();
    if (textToParse==null || textToParse.length()==0){
        logger.warn("input to extractEntities was null or zero!");
        return entities;
    }

    // An unsupported language has no recognizer in the map; report it and return the empty
    // result rather than failing with a NullPointerException further down.
    AbstractSequenceClassifier<CoreMap> recognizer = recognizerByLanguage.get(language);
    if (recognizer == null) {
        logger.error("No NER recognizer available for language: " + language);
        return entities;
    }

    String text = textToParse;
    if(manuallyReplaceDemonyms){    // this is a noticeable performance hit
        logger.debug("Replacing all demonyms by hand");
        text = demonyms.replaceAll(textToParse);
    }

    // extract entities as <Entity Type, Start Index, Stop Index>
    List<Triple<String, Integer, Integer>> extractedEntities =
            recognizer.classifyToCharacterOffsets(text);
    if (extractedEntities != null) {
        for (Triple<String, Integer, Integer> extractedEntity : extractedEntities) {
            String entityName = text.substring(extractedEntity.second(), extractedEntity.third());
            int position = extractedEntity.second();
            switch(extractedEntity.first){
                case "PERS":        // spanish
                case "I-PER":       // german
                case "PERSON":      // english
                    if(personToPlaceSubstitutions.contains(entityName)){
                        entities.addLocation( getLocationOccurrence(personToPlaceSubstitutions.getSubstitution(entityName), position) );
                        logger.debug("Changed person "+entityName+" to a place");
                    } else {
                        PersonOccurrence person = new PersonOccurrence(entityName, position);
                        entities.addPerson( person );
                    }
                    break;
                case "LUG":         // spanish
                case "I-LOC":       // german
                case "LOCATION":    // english
                    if(!locationBlacklist.contains(entityName)){
                        entities.addLocation( getLocationOccurrence(entityName, position) );
                    } else {
                        logger.debug("Ignored blacklisted location "+entityName);
                    }
                    break;
                case "ORG":             // spanish
                case "I-ORG":           // german
                case "ORGANIZATION":    // english
                    OrganizationOccurrence organization = new OrganizationOccurrence(entityName, position);
                    entities.addOrganization( organization );
                    break;
                case "OTROS":   // spanish
                case "MISC":    // if you're using the slower 4class model
                    if (demonyms.contains(entityName)) {
                        logger.debug("Found and adding a MISC demonym "+entityName);
                        entities.addLocation( getLocationOccurrence(entityName, position) );
                    }
                    break;
                default:
                    logger.error("Unknown NER type :"+ extractedEntity.first);
            }
        }
    }
    return entities;
}
示例10
/**
 * Extracts entities from a batch of sentences, one recognizer pass per sentence.
 *
 * <p>Each map must provide a {@code "story_sentences_id"} and a {@code "sentence"} entry.
 * Locations reported directly by the recognizer are stored together with the id of the
 * sentence they came from. The NER labels handled are the same English, Spanish and
 * German labels that {@code extractEntities} handles, so results do not depend on which
 * entry point is used.
 *
 * @param sentences                 the sentences to process
 * @param manuallyReplaceDemonyms   whether demonyms are replaced in each sentence before
 *                                  recognition (a noticeable performance hit)
 * @param language                  what language to parse in
 * @return all the entities mentioned; empty when {@code sentences} is null/empty or when
 *         no recognizer is registered for {@code language}
 */
@Override
@SuppressWarnings("rawtypes")
public ExtractedEntities extractEntitiesFromSentences(Map[] sentences, boolean manuallyReplaceDemonyms, String language) {
    ExtractedEntities entities = new ExtractedEntities();
    // The null test makes the warning below truthful: a null array used to throw a NullPointerException here.
    if (sentences==null || sentences.length==0){
        logger.warn("input to extractEntities was null or zero!");
        return entities;
    }

    // An unsupported language has no recognizer in the map; report it and return the empty
    // result rather than failing with a NullPointerException inside the loop.
    AbstractSequenceClassifier<CoreMap> recognizer = recognizerByLanguage.get(language);
    if (recognizer == null) {
        logger.error("No NER recognizer available for language: " + language);
        return entities;
    }

    if(manuallyReplaceDemonyms){    // this is a noticeable performance hit
        logger.debug("Replacing all demonyms by hand");
    }

    for(Map s:sentences){
        String storySentencesId = s.get("story_sentences_id").toString();
        String text = s.get("sentence").toString();
        if(manuallyReplaceDemonyms){    // this is a noticeable performance hit
            text = demonyms.replaceAll(text);
        }
        // extract entities as <Entity Type, Start Index, Stop Index>
        List<Triple<String, Integer, Integer>> extractedEntities =
                recognizer.classifyToCharacterOffsets(text);
        if (extractedEntities != null) {
            for (Triple<String, Integer, Integer> extractedEntity : extractedEntities) {
                String entityName = text.substring(extractedEntity.second(), extractedEntity.third());
                int position = extractedEntity.second();
                switch(extractedEntity.first){
                    case "PERS":        // spanish
                    case "I-PER":       // german
                    case "PERSON":      // english
                        if(personToPlaceSubstitutions.contains(entityName)){
                            entities.addLocation( getLocationOccurrence(personToPlaceSubstitutions.getSubstitution(entityName), position) );
                            logger.debug("Changed person "+entityName+" to a place");
                        } else {
                            PersonOccurrence person = new PersonOccurrence(entityName, position);
                            entities.addPerson( person );
                        }
                        break;
                    case "LUG":         // spanish
                    case "I-LOC":       // german
                    case "LOCATION":    // english
                        if(!locationBlacklist.contains(entityName)){
                            LocationOccurrence loc = getLocationOccurrence(entityName, position);
                            // save the sentence id here
                            entities.addLocation( new SentenceLocationOccurrence(loc.getText(), storySentencesId) );
                        } else {
                            logger.debug("Ignored blacklisted location "+entityName);
                        }
                        break;
                    case "ORG":             // spanish
                    case "I-ORG":           // german
                    case "ORGANIZATION":    // english
                        OrganizationOccurrence organization = new OrganizationOccurrence(entityName, position);
                        entities.addOrganization( organization );
                        break;
                    case "OTROS":   // spanish
                    case "MISC":    // if you're using the slower 4class model
                        if (demonyms.contains(entityName)) {
                            logger.debug("Found and adding a MISC demonym "+entityName);
                            entities.addLocation( getLocationOccurrence(entityName, position) );
                        }
                        break;
                    default:
                        logger.error("Unknown NER type :"+ extractedEntity.first);
                }
            }
        }
    }
    return entities;
}
示例11
/**
 * Sometimes, you might already be using Stanford NER elsewhere in
 * your application, and you'd like to just pass the output from
 * Stanford NER directly into CLAVIN, without having to re-run the
 * input through Stanford NER just to use CLAVIN. This example
 * shows you how to very easily do exactly that.
 *
 * @throws IOException if the model properties resource is missing or cannot be read,
 *                     or if another I/O step of the demo fails
 * @throws ClavinException propagated from the CLAVIN gazetteer / resolver calls
 */
private static void resolveStanfordEntities() throws IOException, ClavinException {

    /*#####################################################################
     *
     * Start with Stanford NER -- no need to get CLAVIN involved for now.
     *
     *###################################################################*/

    // instantiate Stanford NER entity extractor
    final String propertiesResource = "models/english.all.3class.distsim.prop";
    Properties mp = new Properties();
    // try-with-resources closes the classpath stream once the properties are loaded.
    try (InputStream mpis = WorkflowDemoNERD.class.getClassLoader().getResourceAsStream(propertiesResource)) {
        if (mpis == null) {
            // getResourceAsStream reports a missing resource by returning null, which would
            // otherwise surface as an uninformative NullPointerException inside load().
            throw new IOException("classpath resource not found: " + propertiesResource);
        }
        mp.load(mpis);
    }
    AbstractSequenceClassifier<CoreMap> namedEntityRecognizer =
            CRFClassifier.getJarClassifier("/models/english.all.3class.distsim.crf.ser.gz", mp);

    // Unstructured text file about Somalia to be geoparsed
    File inputFile = new File("src/test/resources/sample-docs/Somalia-doc.txt");

    // Grab the contents of the text file as a String
    String inputString = TextUtils.fileToString(inputFile);

    // extract entities from input text using Stanford NER
    List<Triple<String, Integer, Integer>> entitiesFromNER = namedEntityRecognizer.classifyToCharacterOffsets(inputString);

    /*#####################################################################
     *
     * Now, CLAVIN comes into play...
     *
     *###################################################################*/

    // convert Stanford NER output to ClavinLocationResolver input
    List<LocationOccurrence> locationsForCLAVIN = convertNERtoCLAVIN(entitiesFromNER, inputString);

    // instantiate the CLAVIN location resolver
    ClavinLocationResolver clavinLocationResolver = new ClavinLocationResolver(new LuceneGazetteer(new File("./IndexDirectory")));

    // resolve location entities extracted from input text
    List<ResolvedLocation> resolvedLocations = clavinLocationResolver.resolveLocations(locationsForCLAVIN, 1, 1, false);

    // Display the ResolvedLocations found for the location names
    for (ResolvedLocation resolvedLocation : resolvedLocations) {
        System.out.println(resolvedLocation);
    }
}
示例12
/**
 * Returns a sequence classifier; how it is obtained is left to concrete subclasses.
 *
 * @return an {@link AbstractSequenceClassifier} parameterized on {@link CoreLabel}
 */
abstract AbstractSequenceClassifier<CoreLabel> classifier();