Java源码示例:edu.stanford.nlp.ie.AbstractSequenceClassifier

示例1
/**
 * Runs only the named-entity classifier (Conditional Random Fields) over the input
 * and records every recognized entity as an annotation.
 *
 * @param input    the string to annotate
 * @param hash     the input hash code, handed to the resulting {@code Annotations}
 * @param language the input language, used to select the classifier
 * @return the annotations, holding one {@code NER} entry per recognized entity
 * @throws InterruptedException declared by this method; NOTE(review): presumably raised
 *                              while the language model is obtained -- confirm against
 *                              {@code CoreNlpNerModels}
 */
private Annotations processNerClassifier(String input, String hash, Language language) throws InterruptedException {
    final Annotations result = new Annotations(hash, getType(), language);

    LOGGER.info("name-finding for " + language.toString());

    // Look up the classifier registered for this language and run it over the raw input
    final CoreNlpAnnotator<AbstractSequenceClassifier<CoreLabel>> nerModel =
            CoreNlpNerModels.getInstance().get(language);
    final List<Triple<String, Integer, Integer>> entities =
            nerModel.annotator.classifyToCharacterOffsets(input);

    // Each triple is <category, begin offset, end offset>
    for (final Triple<String, Integer, Integer> entity : entities) {
        final NamedEntity.Category category = NamedEntity.Category.parse(entity.first());
        final int startOffset = entity.second();
        final int endOffset = entity.third();
        result.add(NER, startOffset, endOffset, category);
    }

    return result;
}
 
示例2
/**
 * Checks conversion of Stanford NER output format into
 * {@link com.bericotech.clavin.resolver.ClavinLocationResolver}
 * input format.
 *
 * @throws IOException if the model properties resource is missing or cannot be read
 */
@Test
public void testConvertNERtoCLAVIN() throws IOException {
    Properties mp = new Properties();
    // try-with-resources: the classpath stream is closed even when load() fails
    try (InputStream mpis = this.getClass().getClassLoader().getResourceAsStream("models/english.all.3class.distsim.prop")) {
        if (mpis == null) {
            // getResourceAsStream signals "not found" with null; fail with a readable message
            throw new IOException("classpath resource not found: models/english.all.3class.distsim.prop");
        }
        mp.load(mpis);
    }
    AbstractSequenceClassifier<CoreMap> namedEntityRecognizer =
            CRFClassifier.getJarClassifier("/models/english.all.3class.distsim.crf.ser.gz", mp);

    String text = "I was born in Springfield and grew up in Boston.";
    // each triple is <entity type, start offset, end offset>
    List<Triple<String, Integer, Integer>> entitiesFromNER = namedEntityRecognizer.classifyToCharacterOffsets(text);

    List<LocationOccurrence> locationsForCLAVIN = convertNERtoCLAVIN(entitiesFromNER, text);
    assertEquals("wrong number of entities", 2, locationsForCLAVIN.size());
    assertEquals("wrong text for first entity", "Springfield", locationsForCLAVIN.get(0).getText());
    assertEquals("wrong position for first entity", 14, locationsForCLAVIN.get(0).getPosition());
    assertEquals("wrong text for second entity", "Boston", locationsForCLAVIN.get(1).getText());
    assertEquals("wrong position for second entity", 41, locationsForCLAVIN.get(1).getPosition());
}
 
示例3
/**
 * Loads the CRF classifier stored at {@code classifierFilePath}.
 * If loading fails for any reason, the failure is logged and
 * {@code NULL_OBJECT_CLASSIFIER} is returned instead, so this method never throws.
 *
 * @return the loaded classifier, or {@code NULL_OBJECT_CLASSIFIER} on failure
 */
protected AbstractSequenceClassifier<CoreLabel> initialValue() {
	AbstractSequenceClassifier<CoreLabel> loaded;
	try {
		loaded = CRFClassifier.getClassifier(classifierFilePath);
	} catch (final Exception exception) {
		// NOTE(review): the caught exception itself is not passed to the logger
		LOGGER.error(MessageCatalog._00052_CLASSIFIER_LOAD_FAILURE, classifierFilePath);
		loaded = NULL_OBJECT_CLASSIFIER;
	}
	return loaded;
}
 
示例4
/**
 * Returns the classifier, loading it from {@code classifierFilePath} on first use.
 * The whole lookup runs while holding this object's monitor. When loading fails,
 * the failure is logged and {@code NULL_OBJECT_CLASSIFIER} is stored and returned,
 * so loading is not attempted again.
 *
 * @return the cached classifier, or {@code NULL_OBJECT_CLASSIFIER} if loading failed
 */
@Override
AbstractSequenceClassifier<CoreLabel> classifier() {
	synchronized (this) {
		// Already initialised (possibly to the null-object fallback): reuse it
		if (classifier != null) {
			return classifier;
		}
		try {
			classifier = CRFClassifier.getClassifier(classifierFilePath);
		} catch (final Exception exception) {
			LOGGER.error(MessageCatalog._00052_CLASSIFIER_LOAD_FAILURE, classifierFilePath);
			classifier = NULL_OBJECT_CLASSIFIER;
		}
		return classifier;
	}
}
 
示例5
/**
 * Builds a CRF-based recognizer from a model file and a properties resource,
 * both located under {@code models/}.
 *
 * @param NERmodel file name of the serialized model; loaded from {@code "models/" + NERmodel}
 * @param NERprop  file name of the properties resource; looked up on the classpath
 *                 as {@code "models/" + NERprop}
 * @return the classifier created by {@code CRFClassifier.getClassifier}
 * @throws IOException            if the properties resource is missing or cannot be read
 * @throws ClassCastException     declared for the underlying classifier loading
 * @throws ClassNotFoundException declared for the underlying classifier loading
 */
private AbstractSequenceClassifier<CoreMap> recognizerForFiles(String NERmodel, String NERprop) throws IOException, ClassCastException, ClassNotFoundException {
    String propResource = "models/" + NERprop;
    Properties mp = new Properties();
    // try-with-resources: the classpath stream is closed even when load() fails
    try (InputStream mpis = this.getClass().getClassLoader().getResourceAsStream(propResource)) {
        if (mpis == null) {
            // getResourceAsStream signals "not found" with null; fail with a readable message
            throw new IOException("NER properties resource not found on classpath: " + propResource);
        }
        mp.load(mpis);
    }
    AbstractSequenceClassifier<CoreMap> recognizer = (AbstractSequenceClassifier<CoreMap>) CRFClassifier.getClassifier("models/" + NERmodel, mp);
    return recognizer;
}
 
示例6
/**
 * Loads one NER recognizer per supported language (German, Spanish, English) and
 * then creates the demonym map, the custom substitution map, the location
 * blacklist and the person-to-place substitution map.
 *
 * @param config the configuration object; it is not read inside this method
 * @throws ClassCastException     propagated from loading a recognizer
 * @throws IOException            propagated from loading a recognizer
 * @throws ClassNotFoundException propagated from loading a recognizer
 */
public void initialize(CliffConfig config) throws ClassCastException, IOException, ClassNotFoundException{
    // Recognizers, keyed by language constant
    recognizerByLanguage = new HashMap<>();
    recognizerByLanguage.put(
            GERMAN,
            recognizerForFiles("german.conll.germeval2014.hgc_175m_600.crf.ser.gz", "german-2018.hgc_175m_600.prop"));
    recognizerByLanguage.put(
            SPANISH,
            recognizerForFiles("spanish.ancora.distsim.s512.crf.ser.gz", "spanish.ancora.distsim.s512.prop"));
    recognizerByLanguage.put(
            ENGLISH,
            recognizerForFiles("english.all.3class.caseless.distsim.crf.ser.gz", "english.all.3class.caseless.distsim.prop"));

    // Lookup tables used while post-processing recognized entities
    demonyms = new WikipediaDemonymMap();
    customSubstitutions = new CustomSubstitutionMap(CUSTOM_SUBSTITUTION_FILE);
    locationBlacklist = new Blacklist(LOCATION_BLACKLIST_FILE);
    personToPlaceSubstitutions = new CustomSubstitutionMap(PERSON_TO_PLACE_FILE, false);
}
 
示例7
/**
 * Renders an already-classified sentence as text in the "inlineXML" output style.
 *
 * <p>The {@code readerAndWriter} argument is not used: a fresh
 * {@code PlainTextDocumentReaderAndWriter} is always created and initialised with
 * the classifier's flags.
 *
 * @param sentence        the tokens of the sentence to render
 * @param readerAndWriter ignored by this method
 * @param classif         classifier whose {@code flags} initialise the writer
 * @return the answers produced by the writer, with spacing preserved
 */
public static String classifyToString(List<CoreMap> sentence, DocumentReaderAndWriter<CoreMap> readerAndWriter, AbstractSequenceClassifier classif) {
  final PlainTextDocumentReaderAndWriter<CoreMap> plainTextWriter =
    new PlainTextDocumentReaderAndWriter<CoreMap>();
  plainTextWriter.init(classif.flags);

  final PlainTextDocumentReaderAndWriter.OutputStyle inlineXml =
    PlainTextDocumentReaderAndWriter.OutputStyle.fromShortName("inlineXML");

  // String.valueOf keeps the "null" text a StringBuilder would produce for a null answer
  return String.valueOf(plainTextWriter.getAnswers(sentence, inlineXml, true));
}
 
示例8
/**
 * Returns the classifier currently supplied by {@code classifiers}.
 *
 * @return the value of {@code classifiers.get()}
 */
@Override
AbstractSequenceClassifier<CoreLabel> classifier() {
	final AbstractSequenceClassifier<CoreLabel> current = classifiers.get();
	return current;
}
 
示例9
/**
 * Get extracted people, locations and organizations from a plain-text body.
 *
 * <p>Reported positions are offsets into the text that was actually classified.
 * When {@code manuallyReplaceDemonyms} is set, that is the text after demonym
 * replacement, which may differ from {@code textToParse}.
 *
 * @param textToParse               Text content to perform extraction on.
 * @param manuallyReplaceDemonyms   Can slow down performance quite a bit
 * @param language                  What language to parse in; key into {@code recognizerByLanguage}
 * @return All the entities mentioned; empty when the input is null or empty, or
 *         when no recognizer is registered for {@code language}
 */
@Override
public ExtractedEntities extractEntities(String textToParse, boolean manuallyReplaceDemonyms, String language) {
    ExtractedEntities entities = new ExtractedEntities();

    if (textToParse==null || textToParse.length()==0){
        logger.warn("input to extractEntities was null or zero!");
        return entities;
    }

    // An unknown language used to surface as a NullPointerException further down;
    // treat it like unusable input instead.
    AbstractSequenceClassifier<CoreMap> recognizer = recognizerByLanguage.get(language);
    if (recognizer == null) {
        logger.error("No NER recognizer loaded for language: " + language);
        return entities;
    }

    String text = textToParse;
    if(manuallyReplaceDemonyms){    // this is a noticeable performance hit
        logger.debug("Replacing all demonyms by hand");
        text = demonyms.replaceAll(textToParse);
    }

    // extract entities as <Entity Type, Start Index, Stop Index>
    List<Triple<String, Integer, Integer>> extractedEntities =
            recognizer.classifyToCharacterOffsets(text);

    if (extractedEntities != null) {
        for (Triple<String, Integer, Integer> extractedEntity : extractedEntities) {
            String entityName = text.substring(extractedEntity.second(), extractedEntity.third());
            int position = extractedEntity.second();
            switch(extractedEntity.first()){
            case "PERS":        // spanish
            case "I-PER":       // german
            case "PERSON":      // english
                // some "people" are configured to be treated as places
                if(personToPlaceSubstitutions.contains(entityName)){
                    entities.addLocation( getLocationOccurrence(personToPlaceSubstitutions.getSubstitution(entityName), position) );
                    logger.debug("Changed person "+entityName+" to a place");
                } else {
                    PersonOccurrence person = new PersonOccurrence(entityName, position);
                    entities.addPerson( person );
                }
                break;
            case "LUG":         // spanish
            case "I-LOC":       // german
            case "LOCATION":    // english
                if(!locationBlacklist.contains(entityName)){
                    entities.addLocation( getLocationOccurrence(entityName, position) );
                } else {
                   logger.debug("Ignored blacklisted location "+entityName);
                }
                break;
            case "ORG":             // spanish
            case "I-ORG":           // german
            case "ORGANIZATION":    // english
                OrganizationOccurrence organization = new OrganizationOccurrence(entityName, position);
                entities.addOrganization( organization );
                break;
            case "OTROS":   // spanish
            case "MISC":    // if you're using the slower 4class model
                // only MISC entities that are known demonyms count as locations
                if (demonyms.contains(entityName)) {
                    logger.debug("Found and adding a MISC demonym "+entityName);
                    entities.addLocation( getLocationOccurrence(entityName, position) );
                }
                break;
            default:
                logger.error("Unknown NER type :"+ extractedEntity.first());
            }
        }
    }

    return entities;
}
 
示例10
/**
 * Extracts people, locations and organizations from a list of sentences.
 *
 * <p>Each element of {@code sentences} is read through the keys
 * {@code "story_sentences_id"} and {@code "sentence"}. Locations found through the
 * location tag are stored together with the id of the sentence they came from.
 * Positions are offsets within the individual sentence text, after demonym
 * replacement when that is enabled.
 *
 * @param sentences                 the sentences to process
 * @param manuallyReplaceDemonyms   Can slow down performance quite a bit
 * @param language                  What language to parse in; key into {@code recognizerByLanguage}
 * @return All the entities mentioned; empty when {@code sentences} is null or
 *         empty, or when no recognizer is registered for {@code language}
 */
@Override
@SuppressWarnings("rawtypes")
public ExtractedEntities extractEntitiesFromSentences(Map[] sentences, boolean manuallyReplaceDemonyms, String language) {
    ExtractedEntities entities = new ExtractedEntities();

    // the message promises a null check, so actually perform one
    if (sentences==null || sentences.length==0){
        logger.warn("input to extractEntities was null or zero!");
        return entities;
    }

    // An unknown language used to surface as a NullPointerException inside the loop;
    // treat it like unusable input instead.
    AbstractSequenceClassifier<CoreMap> recognizer = recognizerByLanguage.get(language);
    if (recognizer == null) {
        logger.error("No NER recognizer loaded for language: " + language);
        return entities;
    }

    if(manuallyReplaceDemonyms){    // this is a noticeable performance hit
        logger.debug("Replacing all demonyms by hand");
    }

    for(Map s:sentences){
        String storySentencesId = s.get("story_sentences_id").toString();
        String text = s.get("sentence").toString();
        if(manuallyReplaceDemonyms){    // this is a noticeable performance hit
            text = demonyms.replaceAll(text);
        }
        // extract entities as <Entity Type, Start Index, Stop Index>
        List<Triple<String, Integer, Integer>> extractedEntities =
            recognizer.classifyToCharacterOffsets(text);
        if (extractedEntities != null) {
            for (Triple<String, Integer, Integer> extractedEntity : extractedEntities) {
                String entityName = text.substring(extractedEntity.second(), extractedEntity.third());
                int position = extractedEntity.second();
                // The recognizer is chosen per language, so accept the same Spanish and
                // German tags as extractEntities rather than only the English ones.
                switch(extractedEntity.first()){
                case "PERS":        // spanish
                case "I-PER":       // german
                case "PERSON":      // english
                    // some "people" are configured to be treated as places
                    if(personToPlaceSubstitutions.contains(entityName)){
                        entities.addLocation( getLocationOccurrence(personToPlaceSubstitutions.getSubstitution(entityName), position) );
                        logger.debug("Changed person "+entityName+" to a place");
                    } else {
                        PersonOccurrence person = new PersonOccurrence(entityName, position);
                        entities.addPerson( person );
                    }
                    break;
                case "LUG":         // spanish
                case "I-LOC":       // german
                case "LOCATION":    // english
                    if(!locationBlacklist.contains(entityName)){
                        LocationOccurrence loc = getLocationOccurrence(entityName, position);
                        // save the sentence id here
                        entities.addLocation( new SentenceLocationOccurrence(loc.getText(), storySentencesId) );
                    } else {
                       logger.debug("Ignored blacklisted location "+entityName);
                    }
                    break;
                case "ORG":             // spanish
                case "I-ORG":           // german
                case "ORGANIZATION":    // english
                    OrganizationOccurrence organization = new OrganizationOccurrence(entityName, position);
                    entities.addOrganization( organization );
                    break;
                case "OTROS":   // spanish
                case "MISC":    // if you're using the slower 4class model
                    // only MISC entities that are known demonyms count as locations
                    if (demonyms.contains(entityName)) {
                        logger.debug("Found and adding a MISC demonym "+entityName);
                        entities.addLocation( getLocationOccurrence(entityName, position) );
                    }
                    break;
                default:
                    logger.error("Unknown NER type :"+ extractedEntity.first());
                }
            }
        }
    }

    return entities;
}
 
示例11
/**
 * Sometimes, you might already be using Stanford NER elsewhere in
 * your application, and you'd like to just pass the output from
 * Stanford NER directly into CLAVIN, without having to re-run the
 * input through Stanford NER just to use CLAVIN. This example
 * shows you how to very easily do exactly that.
 *
 * @throws IOException if the model properties resource is missing or cannot be read,
 *                     or if another I/O step of the workflow fails
 * @throws ClavinException propagated from the CLAVIN calls
 */
private static void resolveStanfordEntities() throws IOException, ClavinException {

    /*#####################################################################
     *
     * Start with Stanford NER -- no need to get CLAVIN involved for now.
     *
     *###################################################################*/

    // instantiate Stanford NER entity extractor
    Properties mp = new Properties();
    // try-with-resources: the classpath stream is closed even when load() fails
    try (InputStream mpis = WorkflowDemoNERD.class.getClassLoader().getResourceAsStream("models/english.all.3class.distsim.prop")) {
        if (mpis == null) {
            // getResourceAsStream signals "not found" with null; fail with a readable message
            throw new IOException("classpath resource not found: models/english.all.3class.distsim.prop");
        }
        mp.load(mpis);
    }
    AbstractSequenceClassifier<CoreMap> namedEntityRecognizer =
            CRFClassifier.getJarClassifier("/models/english.all.3class.distsim.crf.ser.gz", mp);

    // Unstructured text file about Somalia to be geoparsed
    File inputFile = new File("src/test/resources/sample-docs/Somalia-doc.txt");

    // Grab the contents of the text file as a String
    String inputString = TextUtils.fileToString(inputFile);

    // extract entities from input text using Stanford NER
    List<Triple<String, Integer, Integer>> entitiesFromNER = namedEntityRecognizer.classifyToCharacterOffsets(inputString);

    /*#####################################################################
     *
     * Now, CLAVIN comes into play...
     *
     *###################################################################*/

    // convert Stanford NER output to ClavinLocationResolver input
    List<LocationOccurrence> locationsForCLAVIN = convertNERtoCLAVIN(entitiesFromNER, inputString);

    // instantiate the CLAVIN location resolver
    ClavinLocationResolver clavinLocationResolver = new ClavinLocationResolver(new LuceneGazetteer(new File("./IndexDirectory")));

    // resolve location entities extracted from input text
    List<ResolvedLocation> resolvedLocations = clavinLocationResolver.resolveLocations(locationsForCLAVIN, 1, 1, false);

    // Display the ResolvedLocations found for the location names
    for (ResolvedLocation resolvedLocation : resolvedLocations) {
        System.out.println(resolvedLocation);
    }
}
 
示例12
/**
 * Supplies the sequence classifier to use. Being abstract, this leaves it to each
 * concrete subclass to decide how the classifier instance is obtained.
 *
 * @return a sequence classifier over {@code CoreLabel} tokens
 */
abstract AbstractSequenceClassifier<CoreLabel> classifier();